diff --git a/configs/annealing/peteish7-anneal-from-928646-50B-nowup-moremath-dclm-nvidia25-fw2-se-flan.yaml b/configs/annealing/peteish7-anneal-from-928646-50B-nowup-moremath-dclm-nvidia25-fw2-se-flan.yaml new file mode 100644 index 000000000..032547552 --- /dev/null +++ b/configs/annealing/peteish7-anneal-from-928646-50B-nowup-moremath-dclm-nvidia25-fw2-se-flan.yaml @@ -0,0 +1,814 @@ +run_name: peteish7-anneal-from-928646-50B-nowup-moremath-dclm-nvidia25-fw2-se-flan +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: 
null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: 
natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # ProofPile 2: Algebraic Stack Data + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-09-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-15-00000.npy + + # ProofPile 2: Arxiv Data + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-06-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-19-00000.npy + + # ProofPile 2: Open Web Math Data + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-00-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-12-00000.npy + + # MetaMathQA (87M tokens) + - s3://ai2-llm/preprocessed/meta-math_MetaMathQA/v0/tokens/allenai/dolma2-tokenizer/part-0-00000.npy + + # Mathpile (4.9B tokens) + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00006.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00007.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00008.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00009.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00010.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00005.npy + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00006.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00007.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00008.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00009.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00010.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00006.npy + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00007.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00008.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00009.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00010.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00006.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/proofwiki/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/proofwiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-02-00000.npy 
+ - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/textbooks/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/textbooks/allenai/dolma2-tokenizer/part-0-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/textbooks/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/wikipedia/allenai/dolma2-tokenizer/part-0-00000.npy + + # AutoMathText (43.5B tokens) + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-2-00000.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-3-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-0-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-1-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-1-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-2-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-3-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-3-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00002.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-1-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-1-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-3-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-3-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00002.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-01-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-02-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00006.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00007.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00008.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00009.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00010.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00002.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-09-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-09-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00006.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00007.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00008.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00002.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00006.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00007.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00008.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-12-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00004.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-15-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-15-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-1-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-3-00000.npy + + # Pes2o Data + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-06-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-12-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-13-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-14-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-15-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-16-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-17-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-18-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-19-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-20-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-21-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-22-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-23-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-24-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-25-00000.npy + + # Starcoder Data (fixed!) 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-000-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-001-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-002-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-003-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-004-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-005-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-006-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-007-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-008-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-009-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-010-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-011-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-012-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-013-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-014-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-015-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-016-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-017-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-018-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-019-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-020-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-021-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-022-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-023-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-024-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-025-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-026-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-027-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-028-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-029-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-030-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-031-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-032-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-033-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-034-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-035-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-036-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-037-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-038-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-039-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-040-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-041-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-042-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-043-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-044-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-045-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-046-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-047-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-048-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-049-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-050-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-051-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-052-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-053-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-054-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-055-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-056-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-057-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-058-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-059-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-060-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-061-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-062-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-063-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-064-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-065-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-066-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-067-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-068-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-069-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-070-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-071-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-072-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-073-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-074-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-075-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-076-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-077-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-078-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-079-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-080-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-081-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-082-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-083-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-084-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-085-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-086-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-087-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-088-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-089-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-090-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-091-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-092-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-093-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-094-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-095-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-096-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-097-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-098-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-099-00000.npy + + # DCLM Data filtered using NVIDIA quality classifier (1,150,819,744,347 total, 145,778,821,164 taken) + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-087-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-084-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-089-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-086-00002.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-082-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-081-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-085-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-108-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-080-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-088-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-083-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-008-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-122-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-089-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-082-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-080-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-085-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-108-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-108-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-085-00001.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-088-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-088-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-087-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-086-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-118-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-080-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-008-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-087-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-081-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-089-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-086-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-081-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-083-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-008-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-020-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-084-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-082-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-083-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-084-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-077-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-072-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-075-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-078-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-074-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-076-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-077-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-079-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/full/allenai/dolma2-tokenizer/part-071-00000.npy + + # Wikipedia + - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + + # GSM8K + - /weka/oe-training-default/ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + + # CodeSearchNet + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-15-00000.npy + + # Flan + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-84-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-91-00000.npy + + # StackExchange + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy diff --git a/configs/annealing/peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan-seed2.yaml b/configs/annealing/peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan-seed2.yaml new file mode 100644 index 000000000..8f2d9250b --- /dev/null +++ b/configs/annealing/peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan-seed2.yaml @@ -0,0 +1,2750 @@ +run_name: peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan-seed2 +seed: 6209 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: 
meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - 
label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # ProofPile 2: Algebraic Stack Data + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-04-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-15-00000.npy + + # ProofPile 2: Arxiv Data + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-01-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-15-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-19-00000.npy + + # ProofPile 2: Open Web Math Data + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-08-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-12-00000.npy + + # MetaMathQA (87M tokens) + - s3://ai2-llm/preprocessed/meta-math_MetaMathQA/v0/tokens/allenai/dolma2-tokenizer/part-0-00000.npy + + # Mathpile (4.9B tokens) + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00006.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00007.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00008.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00009.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00010.npy + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00006.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00007.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00008.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00009.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00010.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00000.npy + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00006.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00007.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00008.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00009.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00010.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00006.npy + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/proofwiki/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/proofwiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/textbooks/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/textbooks/allenai/dolma2-tokenizer/part-0-00001.npy + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/textbooks/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/wikipedia/allenai/dolma2-tokenizer/part-0-00000.npy + + # AutoMathText (43.5B tokens) + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-3-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-0-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-1-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-1-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-2-00002.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-3-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-3-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-1-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-1-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-3-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-3-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-0-00000.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-01-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-02-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00006.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00007.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00008.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00009.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00010.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-09-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-09-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00005.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00006.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00007.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00008.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00006.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00007.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00008.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-12-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00003.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-15-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-15-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-1-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-3-00000.npy + + # Pes2o Data + - 
/weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-12-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-13-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-14-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-15-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-16-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-17-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-18-00000.npy + #- 
/weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-19-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-20-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-21-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-22-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-23-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-24-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-25-00000.npy + + # Starcoder Data (fixed!) + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-000-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-001-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-002-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-003-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-004-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-005-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-006-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-007-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-008-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-009-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-010-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-011-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-012-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-013-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-014-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-015-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-016-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-017-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-018-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-019-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-020-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-021-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-022-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-023-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-024-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-025-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-026-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-027-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-028-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-029-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-030-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-031-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-032-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-033-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-034-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-035-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-036-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-037-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-038-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-039-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-040-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-041-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-042-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-043-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-044-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-045-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-046-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-047-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-048-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-049-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-050-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-051-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-052-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-053-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-054-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-055-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-056-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-057-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-058-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-059-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-060-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-061-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-062-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-063-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-064-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-065-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-066-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-067-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-068-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-069-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-070-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-071-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-072-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-073-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-074-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-075-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-076-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-077-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-078-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-079-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-080-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-081-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-082-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-083-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-084-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-085-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-086-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-087-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-088-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-089-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-090-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-091-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-092-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-093-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-094-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-095-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-096-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-097-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-098-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-099-00000.npy + + # DCLM Data filtered to the top 7% AND with fineweb classifier >=2 (751,778,760,196 tokens) + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-01-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-15-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-43-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-57-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-63-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-07-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-21-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-35-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-49-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-63-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-13-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-27-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-41-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-55-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-05-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-19-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-33-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-47-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-60-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-10-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-24-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-38-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-51-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-01-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-15-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-29-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-42-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-56-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-06-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-20-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-33-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-47-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-61-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-11-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-24-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-38-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-52-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-02-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-15-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-29-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-43-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-57-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-06-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-20-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-34-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-48-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-61-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-11-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-25-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-39-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-52-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-02-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-16-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-30-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-43-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-57-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-07-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-21-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-34-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-48-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-62-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-12-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-25-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-39-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-53-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-03-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-16-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-30-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-44-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-58-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-07-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-21-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-35-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-49-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-62-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-12-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-26-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-40-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-53-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-03-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-17-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-31-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-44-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-58-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-08-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-22-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-35-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-49-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-63-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-13-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-26-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-40-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-54-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-04-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-17-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-31-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-45-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-59-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-08-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-22-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-36-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-50-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-63-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-13-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-27-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-41-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-54-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-04-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-18-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-32-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-45-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-59-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-09-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-23-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-36-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-50-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-00-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-14-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-27-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-41-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-55-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-05-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-18-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-32-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-46-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-60-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-09-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-23-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-37-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-51-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-63-00000.npy + + # Wikipedia + - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + + # GSM8K + - /weka/oe-training-default/ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + + # CodeSearchNet + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-15-00000.npy + + # Flan + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-23-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-59-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-91-00000.npy + + # StackExchange + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy diff --git a/configs/annealing/peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan-seed3.yaml b/configs/annealing/peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan-seed3.yaml new file mode 100644 index 000000000..ca3054e13 --- /dev/null +++ b/configs/annealing/peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan-seed3.yaml @@ -0,0 +1,2750 @@ +run_name: peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan-seed3 +seed: 2662 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 
100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # 
type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # ProofPile 2: Algebraic Stack Data + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-03-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-15-00000.npy + + # ProofPile 2: Arxiv Data + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-00-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-14-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-19-00000.npy + + # ProofPile 2: Open Web Math Data + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-07-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-12-00000.npy + + # MetaMathQA (87M tokens) + - s3://ai2-llm/preprocessed/meta-math_MetaMathQA/v0/tokens/allenai/dolma2-tokenizer/part-0-00000.npy + + # Mathpile (4.9B tokens) + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00006.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00007.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00008.npy + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00009.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00010.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00006.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00007.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00008.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00009.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00010.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00003.npy + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00006.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00007.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00008.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00009.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00010.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00004.npy + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00006.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/proofwiki/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/proofwiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/textbooks/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/textbooks/allenai/dolma2-tokenizer/part-0-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/textbooks/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/wikipedia/allenai/dolma2-tokenizer/part-0-00000.npy + + # AutoMathText (43.5B tokens) + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-3-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-0-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-1-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-1-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-2-00000.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-2-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-3-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-3-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-1-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-1-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-3-00001.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-3-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-01-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-02-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00004.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00006.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00007.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00008.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00009.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00010.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00005.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-09-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-09-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00003.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00006.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00007.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00008.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00006.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00007.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00008.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-12-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00001.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-15-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-15-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-1-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-2-00000.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-3-00000.npy + + # Pes2o Data + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-12-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-13-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-14-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-15-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-16-00000.npy + #- 
/weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-17-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-18-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-19-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-20-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-21-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-22-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-23-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-24-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-25-00000.npy + + # Starcoder Data (fixed!) + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-000-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-001-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-002-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-003-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-004-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-005-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-006-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-007-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-008-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-009-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-010-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-011-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-012-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-013-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-014-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-015-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-016-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-017-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-018-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-019-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-020-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-021-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-022-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-023-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-024-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-025-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-026-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-027-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-028-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-029-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-030-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-031-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-032-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-033-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-034-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-035-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-036-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-037-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-038-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-039-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-040-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-041-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-042-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-043-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-044-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-045-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-046-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-047-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-048-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-049-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-050-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-051-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-052-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-053-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-054-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-055-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-056-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-057-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-058-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-059-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-060-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-061-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-062-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-063-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-064-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-065-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-066-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-067-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-068-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-069-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-070-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-071-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-072-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-073-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-074-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-075-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-076-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-077-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-078-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-079-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-080-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-081-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-082-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-083-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-084-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-085-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-086-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-087-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-088-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-089-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-090-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-091-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-092-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-093-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-094-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-095-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-096-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-097-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-098-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-099-00000.npy + + # DCLM Data filtered to the top 7% AND with fineweb classifier >=2 (751,778,760,196 tokens) + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-00-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-14-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-42-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-56-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-63-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-06-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-20-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-34-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-48-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-62-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-63-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-12-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-26-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-40-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-54-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-04-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-18-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-32-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-46-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-59-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-09-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-23-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-37-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-50-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-00-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-14-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-28-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-41-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-55-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-05-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-19-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-32-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-46-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-60-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-10-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-23-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-37-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-51-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-01-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-14-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-28-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-42-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-56-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-05-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-19-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-33-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-47-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-60-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-10-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-24-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-38-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-51-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-01-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-15-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-29-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-42-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-56-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-06-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-20-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-33-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-47-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-61-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-11-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-24-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-38-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-52-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-02-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-15-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-29-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-43-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-57-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-06-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-20-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-34-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-48-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-61-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-11-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-25-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-39-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-52-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-02-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-16-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-30-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-43-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-57-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-07-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-21-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-34-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-48-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-62-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-12-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-25-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-39-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-53-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-03-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-16-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-30-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-44-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-58-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-07-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-21-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-35-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-49-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-62-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-12-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-26-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-40-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-53-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-03-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-17-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-31-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-44-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-58-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-08-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-22-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-35-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-49-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-63-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-13-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-26-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-40-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-54-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-04-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-17-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-31-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-45-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-59-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-08-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-22-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-36-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-50-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-63-00000.npy + + # Wikipedia + - 
/weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + + # GSM8K + - /weka/oe-training-default/ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + + # CodeSearchNet + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-13-00000.npy + - 
s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-15-00000.npy + + # Flan + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-22-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-82-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-91-00000.npy + + # StackExchange + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy diff --git a/configs/annealing/peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan-seed4.yaml b/configs/annealing/peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan-seed4.yaml new file mode 100644 index 000000000..3322cb641 --- /dev/null +++ b/configs/annealing/peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan-seed4.yaml @@ -0,0 +1,2750 @@ +run_name: peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan-seed4 +seed: 1110 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + 
attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: 
copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # ProofPile 2: Algebraic Stack Data + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-02-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-15-00000.npy + + # ProofPile 2: Arxiv Data + # - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-13-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-19-00000.npy + + # ProofPile 2: Open Web Math Data + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-06-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-12-00000.npy + + # MetaMathQA (87M tokens) + - s3://ai2-llm/preprocessed/meta-math_MetaMathQA/v0/tokens/allenai/dolma2-tokenizer/part-0-00000.npy + + # Mathpile (4.9B tokens) + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00006.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00007.npy + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00008.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00009.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00010.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00006.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00007.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00008.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00009.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00010.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00002.npy + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00006.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00007.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00008.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00009.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00010.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00003.npy + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00006.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/proofwiki/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/proofwiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/textbooks/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/textbooks/allenai/dolma2-tokenizer/part-0-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/textbooks/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/wikipedia/allenai/dolma2-tokenizer/part-0-00000.npy + + # AutoMathText (43.5B tokens) + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-3-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-0-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-1-00001.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-1-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-2-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-3-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-3-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-1-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-1-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00003.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-3-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-3-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-01-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-02-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00002.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00006.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00007.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00008.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00009.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00010.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00003.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-09-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-09-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00001.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00006.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00007.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00008.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00006.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00007.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00008.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-12-00001.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-15-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-15-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-1-00000.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-1-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-3-00000.npy + + # Pes2o Data + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-12-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-13-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-14-00000.npy + #- 
/weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-15-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-16-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-17-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-18-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-19-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-20-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-21-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-22-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-23-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-24-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-25-00000.npy + + # Starcoder Data (fixed!) 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-000-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-001-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-002-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-003-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-004-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-005-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-006-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-007-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-008-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-009-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-010-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-011-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-012-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-013-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-014-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-015-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-016-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-017-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-018-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-019-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-020-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-021-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-022-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-023-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-024-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-025-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-026-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-027-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-028-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-029-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-030-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-031-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-032-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-033-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-034-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-035-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-036-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-037-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-038-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-039-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-040-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-041-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-042-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-043-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-044-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-045-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-046-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-047-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-048-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-049-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-050-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-051-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-052-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-053-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-054-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-055-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-056-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-057-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-058-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-059-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-060-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-061-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-062-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-063-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-064-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-065-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-066-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-067-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-068-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-069-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-070-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-071-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-072-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-073-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-074-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-075-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-076-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-077-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-078-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-079-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-080-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-081-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-082-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-083-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-084-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-085-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-086-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-087-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-088-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-089-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-090-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-091-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-092-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-093-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-094-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-095-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-096-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-097-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-098-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-099-00000.npy + + # DCLM Data filtered to the top 7% AND with fineweb classifier >=2 (751,778,760,196 tokens) + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-07-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-21-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-35-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-63-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-13-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-27-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-41-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-55-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-63-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-05-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-19-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-33-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-47-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-61-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-11-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-25-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-39-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-53-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-02-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-16-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-30-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-44-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-57-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-07-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-21-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-35-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-48-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-62-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-12-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-26-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-39-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-53-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-03-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-17-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-30-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-44-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-58-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-08-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-21-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-35-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-49-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-63-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-12-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-26-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-40-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-54-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-03-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-17-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-31-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-45-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-58-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-08-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-22-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-36-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-49-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-63-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-13-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-27-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-40-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-54-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-04-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-18-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-31-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-45-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-59-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-09-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-22-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-36-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-50-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-00-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-13-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-27-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-41-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-55-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-04-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-18-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-32-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-46-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-59-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-09-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-23-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-37-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-50-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-00-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-14-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-28-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-41-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-55-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-05-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-19-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-32-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-46-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-60-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-10-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-23-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-37-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-51-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-01-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-14-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-28-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-42-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-56-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-05-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-19-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-33-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-47-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-60-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-10-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-24-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-38-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-51-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-01-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-15-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-29-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-42-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-56-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-06-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-20-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-33-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-47-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-61-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-11-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-24-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-38-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-52-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-02-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-15-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-29-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-43-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-57-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-63-00000.npy + + # Wikipedia + - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + + # GSM8K + - /weka/oe-training-default/ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + + # CodeSearchNet + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-05-00000.npy + 
- s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-15-00000.npy + + # Flan + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-77-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-91-00000.npy + + # StackExchange + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy diff --git a/configs/annealing/peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan-seed5.yaml 
b/configs/annealing/peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan-seed5.yaml new file mode 100644 index 000000000..59e72eb18 --- /dev/null +++ b/configs/annealing/peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan-seed5.yaml @@ -0,0 +1,2750 @@ +run_name: peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan-seed5 +seed: 4095 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: 
/weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + 
num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # ProofPile 2: Algebraic Stack Data + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-10-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-15-00000.npy + + # ProofPile 2: Arxiv Data + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-07-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-19-00000.npy + + # ProofPile 2: Open Web Math Data + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-01-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-12-00000.npy + + # MetaMathQA (87M tokens) + - s3://ai2-llm/preprocessed/meta-math_MetaMathQA/v0/tokens/allenai/dolma2-tokenizer/part-0-00000.npy + + # Mathpile (4.9B tokens) + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00001.npy + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00006.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00007.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00008.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00009.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-0-00010.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00006.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00007.npy + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00008.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00009.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-1-00010.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-2-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00006.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00007.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00008.npy + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00009.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/arXiv/allenai/dolma2-tokenizer/part-3-00010.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00002.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00003.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00004.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00005.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-0-00006.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/commoncrawl/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/proofwiki/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/proofwiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/stackexchange/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/textbooks/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/textbooks/allenai/dolma2-tokenizer/part-0-00001.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/textbooks/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/mathpile/MathPile_Commercial/train/wikipedia/allenai/dolma2-tokenizer/part-0-00000.npy + + # AutoMathText (43.5B tokens) + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-3-00000.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0000/part-3-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-0-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-1-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-1-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-2-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-3-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0001/part-3-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-0-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-1-00000.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-1-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-1-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-2-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-3-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0002/part-3-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer/0003/part-3-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-00-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-01-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-02-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-03-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00006.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00007.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00008.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00009.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-04-00010.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-05-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-07-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00004.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-08-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-09-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-09-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00006.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00007.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-10-00008.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00004.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00005.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00006.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00007.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-11-00008.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-12-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-13-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-14-00004.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-15-00001.npy + - 
s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer/0000/part-15-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00002.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-0-00003.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-1-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-1-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-2-00000.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-2-00001.npy + - s3://ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer/0000/part-3-00000.npy + + # Pes2o Data + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-12-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-13-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-14-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-15-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-16-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-17-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-18-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-19-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-20-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-21-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-22-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-23-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-24-00000.npy + #- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-25-00000.npy + + # Starcoder Data (fixed!) 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-000-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-001-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-002-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-003-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-004-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-005-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-006-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-007-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-008-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-009-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-010-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-011-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-012-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-013-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-014-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-015-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-016-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-017-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-018-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-019-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-020-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-021-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-022-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-023-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-024-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-025-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-026-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-027-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-028-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-029-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-030-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-031-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-032-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-033-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-034-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-035-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-036-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-037-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-038-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-039-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-040-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-041-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-042-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-043-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-044-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-045-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-046-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-047-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-048-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-049-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-050-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-051-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-052-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-053-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-054-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-055-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-056-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-057-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-058-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-059-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-060-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-061-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-062-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-063-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-064-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-065-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-066-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-067-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-068-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-069-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-070-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-071-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-072-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-073-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-074-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-075-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-076-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-077-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-078-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-079-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-080-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-081-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-082-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-083-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-084-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-085-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-086-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-087-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-088-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-089-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-090-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-091-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-092-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-093-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-094-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-095-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-096-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-097-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-098-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-099-00000.npy + + # DCLM Data filtered to the top 7% AND with fineweb classifier >=2 (751,778,760,196 tokens) + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-07-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-21-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-35-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-63-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-13-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-27-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-33-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-41-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-47-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-55-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-61-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-63-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-05-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-19-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-25-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-26-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-27-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-28-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-29-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-30-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-31-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-32-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-33-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-34-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-35-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-36-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-37-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-38-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-39-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-40-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-41-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-42-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-43-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-44-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-45-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-46-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-47-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-48-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-49-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-50-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-51-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-52-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-53-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-54-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-55-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-56-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-57-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-58-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-59-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-60-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-61-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-62-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-11-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-25-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-39-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-53-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-02-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-16-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-30-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-44-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-57-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-07-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-21-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-35-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-48-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-62-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-12-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-26-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-39-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-53-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-03-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-17-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-30-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-44-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-58-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-08-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-21-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-35-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-49-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-63-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-12-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-26-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-40-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-54-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-03-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-17-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-31-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-45-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-58-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-08-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-22-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-36-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-49-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-63-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-13-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-27-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-40-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-54-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-04-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-18-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-31-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-45-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-59-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-09-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-22-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-36-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-50-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-00-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-13-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-27-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-41-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-55-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-04-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-18-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-32-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-46-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-59-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-09-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-23-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-37-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-50-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-00-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-14-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-28-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-41-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-55-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-05-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-19-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-32-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-46-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-60-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-10-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-23-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-37-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-51-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-01-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-14-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-28-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-42-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-56-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-05-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-19-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-33-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-47-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-60-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-10-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-24-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-38-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-51-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-01-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-15-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-29-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-42-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-56-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-06-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-20-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-33-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-47-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-61-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-02-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-11-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-15-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-24-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-29-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-38-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-43-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-52-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-57-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-63-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-00-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-01-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-02-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-03-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-04-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-05-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-06-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-07-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-08-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-09-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-10-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-11-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-12-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-13-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-14-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-15-00000.npy + # - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-16-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-17-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-18-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-19-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-20-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-21-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-22-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-23-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-24-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-25-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-26-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-27-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-28-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-29-00000.npy + # 
- /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-30-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-31-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-32-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-33-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-34-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-35-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-36-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-37-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-38-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-39-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-40-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-41-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-42-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-43-00000.npy + 
# - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-44-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-45-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-46-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-47-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-48-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-49-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-50-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-51-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-52-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-53-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-54-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-55-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-56-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-57-00000.npy 
+ # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-58-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-59-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-60-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-61-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-62-00000.npy + # - /weka/oe-training-default/ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-63-00000.npy + + # Wikipedia + - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + + # GSM8K + - /weka/oe-training-default/ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + + # CodeSearchNet + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-05-00000.npy + 
- s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-15-00000.npy + + # Flan + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-77-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-91-00000.npy + + # StackExchange + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy diff --git a/configs/annealing/peteish7-weka-anneal-from-928646-100B-nowup_big-number-no-whammy-2.yaml 
b/configs/annealing/peteish7-weka-anneal-from-928646-100B-nowup_big-number-no-whammy-2.yaml new file mode 100644 index 000000000..1c3e905c9 --- /dev/null +++ b/configs/annealing/peteish7-weka-anneal-from-928646-100B-nowup_big-number-no-whammy-2.yaml @@ -0,0 +1,1699 @@ +run_name: peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-2_100B +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: 
/weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + 
num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v2_79975/ (84.52MT) + - s3://ai2-llm/preprocessed/personahub_math_v2_79975/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: 
s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/ (1.78MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-16-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-04-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/ (2.11MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-4-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v2_79975/ (84.52MT) + - s3://ai2-llm/preprocessed/personahub_math_v2_79975/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/ (1.78MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-16-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-13-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/ (2.11MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (17.08BT) + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-85-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/ (14.43BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-21-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/shadow_clones/ (36.35BT) + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-18-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-07-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-17-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-10-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-43-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-06-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-45-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-10-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-15-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-06-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-13-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-07-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-36-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-26-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-31-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-22-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-11-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-13-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/metamath/part-0-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/multiadd2/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-13-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-28-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-06-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-34-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-39-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-19-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-22-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-31-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-60-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/multiadd2/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-12-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-49-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-26-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-07-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-14-00000.npy 
+ - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-08-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-21-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/multiadd2/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-15-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-29-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-56-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-48-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-02-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-08-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-22-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-11-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-12-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-55-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-03-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-21-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-61-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-45-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/personahub_math_v2_79975/part-0-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-24-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-41-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-45-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-04-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-49-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/multiadd2/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-39-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-16-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-04-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-01-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-60-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-27-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-38-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-43-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-39-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-01-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-00-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-59-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-26-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-63-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/multiadd2/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-44-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-04-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-09-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-40-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-05-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-59-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-40-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/basic_math_mj/multiadd2/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/owm-filtered-math/codesearchnet/part-16-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/shadow_clones/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-24-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (51.37BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-13-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-61-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-61-00000.npy \ No newline at end of file diff --git a/configs/annealing/peteish7-weka-anneal-from-928646-100B-nowup_legal-whammy-2.yaml b/configs/annealing/peteish7-weka-anneal-from-928646-100B-nowup_legal-whammy-2.yaml new file mode 100644 index 000000000..546283960 --- /dev/null +++ b/configs/annealing/peteish7-weka-anneal-from-928646-100B-nowup_legal-whammy-2.yaml @@ -0,0 +1,1537 @@ +run_name: peteish7-weka-anneal-from-928646-50B-nowup_legal-whammy-2-100b +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + 
learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - 
label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/ (191.58MT) + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-08-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + 
- s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: 
s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/ (21.80MT) + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy + - 
s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/ (19.74MT) + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/ (191.58MT) + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-10-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/ (21.80MT) + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/ (19.74MT) + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-1-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (17.08BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-85-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-11-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/ (9.76BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-15-00000.npy + #SOURCE: 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (51.47BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-46-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-2-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-18-00000.npy \ No newline at end of file diff --git a/configs/annealing/peteish7-weka-anneal-from-928646-300B-nowup_legal-whammy-2.yaml b/configs/annealing/peteish7-weka-anneal-from-928646-300B-nowup_legal-whammy-2.yaml new file mode 100644 index 000000000..60ea58cc5 --- /dev/null +++ b/configs/annealing/peteish7-weka-anneal-from-928646-300B-nowup_legal-whammy-2.yaml @@ -0,0 +1,2792 @@ +run_name: peteish7-weka-anneal-from-928646-50B-nowup_legal-whammy-2-300b +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + 
d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + 
+eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: 
downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/ (58.55BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-07-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/ (191.58MT) + - 
s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-11-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/ (21.80MT) + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/ (19.74MT) + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/ (191.58MT) + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-11-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/ (21.80MT) + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/ (19.74MT) + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/ (191.58MT) + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-03-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + 
- s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: 
s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/ (21.80MT) + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy + - 
s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/ (19.74MT) + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/ (191.58MT) + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-02-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/ (21.80MT) + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/ (19.74MT) + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-1-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (17.08BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-45-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (17.08BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-37-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (156.26BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-55-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-59-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-1-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-22-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-22-00000.npy \ No newline at end of file diff --git a/configs/annealing/peteish7-weka-anneal-from-928646-50B-megamath_v1.1.yaml b/configs/annealing/peteish7-weka-anneal-from-928646-50B-megamath_v1.1.yaml new file mode 100644 index 000000000..7a0210043 --- /dev/null +++ b/configs/annealing/peteish7-weka-anneal-from-928646-50B-megamath_v1.1.yaml @@ -0,0 +1,816 @@ +run_name: peteish7-weka-anneal-from-928646-50B-megamath_v1.1.yaml +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + 
metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + 
type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v2_79975/ (84.52MT) + - s3://ai2-llm/preprocessed/personahub_math_v2_79975/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - 
s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/ (1.65BT) + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-83-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-01-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/ (19.86BT) + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-31-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-41-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-32-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-47-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-60-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-71-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-77-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-44-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-51-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-65-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-16-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-24-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-79-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-30-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-76-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-25-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-35-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-82-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-88-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-50-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-39-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-85-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-42-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-27-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-90-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-23-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-28-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-43-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-26-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-53-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-52-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-78-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-83-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-54-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-74-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-66-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-87-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-57-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-73-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-68-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-91-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-36-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-56-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-46-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-33-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-72-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-81-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-55-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-89-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-80-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-49-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-75-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-48-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-59-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-70-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-40-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-61-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-64-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-34-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-45-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-58-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-67-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-29-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-37-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-69-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-84-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-63-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-62-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/ (1.78MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-16-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-08-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/ (4.80BT) + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-00-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/ (7.50BT) + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-35-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-31-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-48-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-29-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-25-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-44-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-43-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-16-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-47-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-23-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-33-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-24-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-41-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-51-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-40-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-65-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-57-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-63-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-56-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-46-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-59-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-58-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-61-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-60-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-64-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-45-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-36-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-50-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-62-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-38-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-26-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-39-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-34-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-37-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-54-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-52-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-55-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-28-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-30-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-49-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-32-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-53-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (12.20BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-56-00000.npy \ No newline at end of file diff --git a/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup-megamath1.2-dclm07-fw2-se-flan.yaml b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup-megamath1.2-dclm07-fw2-se-flan.yaml new file mode 100644 index 000000000..90d9725bd --- /dev/null +++ b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup-megamath1.2-dclm07-fw2-se-flan.yaml @@ -0,0 +1,1043 @@ +run_name: peteish7-weka-anneal-from-928646-50B-nowup-megamath1.2-dclm07-fw2-se-flan +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + 
truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: 
mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (90.64BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-30-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-1-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-43-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/ (23.48BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-00-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v2_79975/ (84.52MT) + - s3://ai2-llm/preprocessed/personahub_math_v2_79975/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/ (1.65BT) + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-84-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-47-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/ (19.86BT) + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-55-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-71-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-34-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-57-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-50-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-44-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-62-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-39-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-58-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-87-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-74-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-60-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-42-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-82-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-27-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-28-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-31-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-85-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-66-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-61-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-89-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-68-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-88-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-23-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-81-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-59-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-48-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-73-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-30-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-26-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-63-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-84-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-16-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-36-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-64-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-54-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-51-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-65-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-49-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-33-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-40-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-37-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-91-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-72-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-41-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-53-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-86-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-25-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-32-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-52-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-56-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-75-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-80-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-29-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-67-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-47-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-79-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-69-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-77-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-76-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-35-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-43-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-70-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-38-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-46-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-78-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-24-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-83-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-90-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-45-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/ (4.80BT) + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-03-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/ (7.50BT) + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-60-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-43-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-49-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-39-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-27-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-52-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-45-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-63-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-65-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-32-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-40-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-57-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-33-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-23-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-34-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-56-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-16-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-29-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-37-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-24-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-48-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-64-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-30-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-28-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-36-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-38-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-35-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-50-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-51-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-26-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-42-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-46-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-44-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-54-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-62-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-47-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-61-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-59-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-41-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-18-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-31-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-58-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-55-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-25-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-11-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/ (333.59MT) + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer/part-02-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy \ No newline at end of file diff --git a/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-2.yaml b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-2.yaml new file mode 100644 index 000000000..3463736fa --- /dev/null +++ b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-2.yaml @@ -0,0 +1,750 @@ +run_name: peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-2 +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json 
+ truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: 
mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v2_79975/ (84.52MT) + - s3://ai2-llm/preprocessed/personahub_math_v2_79975/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/ (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + 
#SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/ (1.78MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-01-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/ (2.11MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (8.55BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/ (5.87BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-13-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (24.28BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-48-00000.npy \ No newline at end of file diff --git a/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-2_seed-666.yaml b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-2_seed-666.yaml new file mode 100644 index 000000000..f299e9bc0 --- /dev/null +++ b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-2_seed-666.yaml @@ -0,0 +1,750 @@ +run_name: peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-2_seed-666 +seed: 666 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + 
attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # 
num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + 
+ - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v2_79975/ (84.52MT) + - s3://ai2-llm/preprocessed/personahub_math_v2_79975/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/ (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/ (1.78MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-16-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-01-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/ (2.11MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-1-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + #SOURCE: 
s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (8.55BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/ (5.87BT) + - 
s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-13-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (24.28BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-61-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-48-00000.npy \ No newline at end of file diff --git a/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-2_seed-777.yaml b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-2_seed-777.yaml new file mode 100644 index 000000000..769e4c3b1 --- /dev/null +++ b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-2_seed-777.yaml @@ -0,0 +1,750 @@ +run_name: peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-2_seed-777 +seed: 777 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + 
truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: 
mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v2_79975/ (84.52MT) + - s3://ai2-llm/preprocessed/personahub_math_v2_79975/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/ (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + 
#SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/ (1.78MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-01-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/ (2.11MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (8.55BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/ (5.87BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-13-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (24.28BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-48-00000.npy \ No newline at end of file diff --git a/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-3.yaml b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-3.yaml new file mode 100644 index 000000000..0598f1bf9 --- /dev/null +++ b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-3.yaml @@ -0,0 +1,1116 @@ +run_name: peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-3 +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + 
block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # 
generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - 
label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v2_79975/ (84.52MT) + - s3://ai2-llm/preprocessed/personahub_math_v2_79975/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/ (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy 
+ - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/ (1.78MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-16-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/personahub_math_v2_79975_shadow00/ (84.52MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/personahub_math_v2_79975_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/ (9.03MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-35-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-13-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-26-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-32-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-27-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-34-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-31-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-24-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-29-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-28-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-30-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-33-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-23-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-07-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k-synth/resample_v1_6x/dolma2-tokenizer_shadow00/ (1.08MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k-synth/resample_v1_6x/dolma2-tokenizer_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/ (17.06MT) + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-55-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-27-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-28-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-29-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-46-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-47-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-25-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-74-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-77-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-70-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-58-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-68-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-61-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-88-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-31-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-83-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-75-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-64-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-57-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-73-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-63-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-78-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-60-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-49-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-30-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-71-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-35-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-56-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-84-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-36-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-86-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-24-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-41-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-37-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-69-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-65-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-67-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-38-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-43-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-91-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-59-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-79-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-52-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-34-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-50-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-66-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-82-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-32-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-48-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-72-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-40-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-62-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-23-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-39-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-45-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-26-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-76-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-33-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-90-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-85-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-51-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-42-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-81-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-87-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-89-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-53-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_main_train/allenai/dolma2-tokenizer_shadow00/ (1.23MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_main_train/allenai/dolma2-tokenizer_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer_shadow00/ (1.51MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/metamath_shadow00/ (84.22MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/metamath_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/ (1.78MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-12-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/ (2.11MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + #SOURCE: 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (8.68BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-47-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/ (3.90BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + #SOURCE: s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/ (1.59BT) + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-096-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-045-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-027-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-160-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-010-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-130-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-057-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-076-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-177-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-113-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-015-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-099-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-005-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-172-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-189-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-056-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-066-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-041-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-094-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-029-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-092-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-173-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-039-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-119-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-020-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-126-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-164-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-108-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-183-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-021-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-014-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-134-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-137-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-033-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-171-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-166-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-081-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-068-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-187-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-071-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-023-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-063-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-006-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-040-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-131-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-153-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-185-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-088-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-178-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-127-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-118-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-098-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-097-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-077-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-146-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-037-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-100-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-079-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-084-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-110-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-078-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-026-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-190-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-042-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-007-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-104-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-019-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-106-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-124-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-132-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-060-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-191-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-139-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-111-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-064-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-054-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-051-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-030-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-120-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-148-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-145-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-091-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-052-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-149-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-074-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-013-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-072-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-138-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-129-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-012-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-046-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-067-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-004-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-174-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-107-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-053-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-140-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-061-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-117-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-155-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-121-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-065-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-016-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-050-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-151-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-162-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-095-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-032-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-181-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-028-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-048-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-080-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-008-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-009-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-179-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-163-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-102-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-034-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-144-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-168-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-090-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-073-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-115-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-147-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-002-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-047-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-169-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-059-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-001-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-049-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-180-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-141-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-167-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-036-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-083-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-024-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-152-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-182-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-186-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-058-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-122-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-017-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-069-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-031-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-156-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-123-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-143-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-175-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-089-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-101-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-136-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-055-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-114-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-125-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-176-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-142-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-070-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-022-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-093-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-157-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-003-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-116-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-135-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-011-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-159-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-038-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-085-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-165-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-018-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-161-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-044-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-086-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-062-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-188-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-154-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-128-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-087-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-035-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-170-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-109-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-133-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-075-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-150-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-043-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-105-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-103-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-158-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-082-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-184-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-112-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-025-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-000-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (27.38BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-41-00000.npy \ No newline at end of file diff --git a/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-3_seed1337.yaml b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-3_seed1337.yaml new file mode 100644 index 000000000..872487d39 --- /dev/null +++ 
b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-3_seed1337.yaml @@ -0,0 +1,1116 @@ +run_name: peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-3 +seed: 1337 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 
+global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + 
instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v2_79975/ (84.52MT) + - s3://ai2-llm/preprocessed/personahub_math_v2_79975/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/ (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: 
s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/ (1.78MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-16-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/personahub_math_v2_79975_shadow00/ 
(84.52MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/personahub_math_v2_79975_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/ (9.03MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-35-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-26-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-32-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-27-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-34-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-31-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-25-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-24-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-29-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-28-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-30-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-33-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-23-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-07-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k-synth/resample_v1_6x/dolma2-tokenizer_shadow00/ (1.08MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k-synth/resample_v1_6x/dolma2-tokenizer_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/ (17.06MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-55-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-27-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-28-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-29-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-46-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-47-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-25-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-74-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-77-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-70-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-58-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-68-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-61-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-80-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-88-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-31-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-83-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-75-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-64-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-57-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-73-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-63-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-60-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-49-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-44-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-30-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-71-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-35-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-56-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-84-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-36-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-86-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-41-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-37-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-69-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-65-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-67-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-38-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-43-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-91-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-59-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-79-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-52-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-34-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-50-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-66-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-82-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-32-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-48-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-72-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-40-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-62-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-23-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-39-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-45-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-26-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-76-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-33-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-90-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-85-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-54-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-51-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-42-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-81-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-87-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-89-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-53-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_main_train/allenai/dolma2-tokenizer_shadow00/ (1.23MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_main_train/allenai/dolma2-tokenizer_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer_shadow00/ (1.51MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/metamath_shadow00/ (84.22MT) + - 
s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/metamath_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/ (1.78MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-22-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-12-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/ (2.11MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (8.68BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-47-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/ (3.90BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + #SOURCE: s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/ (1.59BT) + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-096-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-045-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-027-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-160-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-010-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-130-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-057-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-076-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-177-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-113-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-015-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-099-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-005-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-172-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-189-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-056-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-066-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-041-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-094-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-029-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-092-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-173-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-039-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-119-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-020-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-126-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-164-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-108-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-183-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-021-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-014-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-134-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-137-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-033-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-171-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-166-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-081-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-068-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-187-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-071-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-023-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-063-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-006-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-040-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-131-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-153-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-185-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-088-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-178-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-127-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-118-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-098-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-097-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-077-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-146-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-037-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-100-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-079-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-084-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-110-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-078-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-026-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-190-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-042-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-007-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-104-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-019-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-106-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-124-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-132-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-060-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-191-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-139-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-111-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-064-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-054-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-051-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-030-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-120-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-148-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-145-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-091-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-052-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-149-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-074-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-013-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-072-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-138-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-129-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-012-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-046-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-067-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-004-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-174-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-107-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-053-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-140-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-061-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-117-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-155-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-121-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-065-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-016-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-050-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-151-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-162-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-095-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-032-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-181-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-028-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-048-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-080-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-008-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-009-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-179-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-163-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-102-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-034-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-144-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-168-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-090-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-073-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-115-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-147-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-002-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-047-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-169-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-059-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-001-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-049-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-180-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-141-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-167-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-036-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-083-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-024-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-152-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-182-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-186-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-058-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-122-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-017-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-069-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-031-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-156-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-123-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-143-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-175-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-089-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-101-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-136-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-055-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-114-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-125-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-176-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-142-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-070-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-022-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-093-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-157-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-003-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-116-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-135-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-011-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-159-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-038-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-085-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-165-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-018-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-161-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-044-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-086-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-062-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-188-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-154-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-128-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-087-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-035-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-170-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-109-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-133-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-075-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-150-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-043-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-105-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-103-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-158-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-082-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-184-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-112-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-025-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-000-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (27.38BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-59-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-30-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-41-00000.npy \ No newline at end of file diff --git a/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-3_seed42.yaml b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-3_seed42.yaml new file mode 100644 index 000000000..5f47dce17 --- /dev/null +++ b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-3_seed42.yaml @@ -0,0 +1,1116 @@ +run_name: peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-3-seed42 +seed: 42 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: 
right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: 
mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v2_79975/ (84.52MT) + - s3://ai2-llm/preprocessed/personahub_math_v2_79975/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/ (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + 
#SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/ (1.78MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-16-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/personahub_math_v2_79975_shadow00/ (84.52MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/personahub_math_v2_79975_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/ (9.03MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-35-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-26-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-27-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-34-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-31-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-25-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-24-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-29-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-28-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-30-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-33-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-23-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-07-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k-synth/resample_v1_6x/dolma2-tokenizer_shadow00/ (1.08MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k-synth/resample_v1_6x/dolma2-tokenizer_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/ (17.06MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-55-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-27-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-28-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-29-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-46-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-25-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-74-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-77-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-70-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-58-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-68-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-61-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-80-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-88-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-31-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-83-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-75-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-64-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-57-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-73-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-63-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-78-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-60-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-49-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-44-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-30-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-71-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-35-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-56-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-13-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-84-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-36-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-86-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-24-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-41-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-37-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-69-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-65-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-67-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-38-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-43-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-59-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-79-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-52-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-34-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-50-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-66-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-82-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-32-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-48-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-72-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-40-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-62-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-23-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-39-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-45-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-26-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-76-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-33-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-90-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-85-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-54-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-51-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-42-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-81-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-87-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-89-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-53-00000.npy + #SOURCE: 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_main_train/allenai/dolma2-tokenizer_shadow00/ (1.23MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_main_train/allenai/dolma2-tokenizer_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer_shadow00/ (1.51MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/metamath_shadow00/ (84.22MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/metamath_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/ (1.78MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-12-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/ (2.11MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ 
(782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (8.68BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-47-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/ (3.90BT) + - 
s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + #SOURCE: s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/ (1.59BT) + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-096-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-045-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-027-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-160-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-010-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-130-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-057-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-076-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-177-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-113-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-015-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-099-00000.npy + 
- s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-005-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-172-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-189-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-056-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-066-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-041-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-094-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-029-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-092-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-173-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-039-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-119-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-020-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-126-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-164-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-108-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-183-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-021-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-014-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-134-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-137-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-033-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-171-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-166-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-081-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-068-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-187-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-071-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-023-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-063-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-006-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-040-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-131-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-153-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-185-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-088-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-178-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-127-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-118-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-098-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-097-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-077-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-146-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-037-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-100-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-079-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-084-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-110-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-078-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-026-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-190-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-042-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-007-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-104-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-019-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-106-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-124-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-132-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-060-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-191-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-139-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-111-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-064-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-054-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-051-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-030-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-120-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-148-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-145-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-091-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-052-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-149-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-074-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-013-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-072-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-138-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-129-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-012-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-046-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-067-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-004-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-174-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-107-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-053-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-140-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-061-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-117-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-155-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-121-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-065-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-016-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-050-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-151-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-162-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-095-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-032-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-181-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-028-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-048-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-080-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-008-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-009-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-179-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-163-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-102-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-034-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-144-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-168-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-090-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-073-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-115-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-147-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-002-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-047-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-169-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-059-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-001-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-049-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-180-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-141-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-167-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-036-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-083-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-024-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-152-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-182-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-186-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-058-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-122-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-017-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-069-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-031-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-156-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-123-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-143-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-175-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-089-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-101-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-136-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-055-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-114-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-125-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-176-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-142-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-070-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-022-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-093-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-157-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-003-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-116-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-135-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-011-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-159-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-038-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-085-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-165-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-018-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-161-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-044-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-086-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-062-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-188-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-154-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-128-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-087-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-035-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-170-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-109-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-133-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-075-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-150-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-043-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-105-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-103-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-158-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-082-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-184-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-112-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-025-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-000-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (27.38BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-18-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-22-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-41-00000.npy \ No newline at end of file diff --git a/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-3_seed42069.yaml b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-3_seed42069.yaml new file mode 100644 index 000000000..56e58845a --- /dev/null +++ b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-3_seed42069.yaml @@ -0,0 +1,1116 @@ +run_name: peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-3 +seed: 42069 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + 
vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # 
type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v2_79975/ (84.52MT) + - s3://ai2-llm/preprocessed/personahub_math_v2_79975/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/ (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/ (1.78MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-00-00000.npy 
+ - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-16-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/personahub_math_v2_79975_shadow00/ (84.52MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/personahub_math_v2_79975_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/ (9.03MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-35-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-26-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-32-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-27-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-34-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-31-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-25-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-24-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-28-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-30-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-33-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-23-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-07-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k-synth/resample_v1_6x/dolma2-tokenizer_shadow00/ (1.08MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k-synth/resample_v1_6x/dolma2-tokenizer_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/ (17.06MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-55-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-28-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-29-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-46-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-47-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-25-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-74-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-77-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-70-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-58-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-68-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-61-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-80-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-88-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-83-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-75-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-64-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-57-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-73-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-63-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-78-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-60-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-49-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-44-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-30-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-35-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-56-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-84-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-36-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-86-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-24-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-41-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-37-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-69-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-65-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-38-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-43-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-91-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-59-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-79-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-52-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-34-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-50-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-66-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-82-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-48-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-72-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-40-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-62-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-23-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-39-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-45-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-26-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-76-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-33-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-90-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-85-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-54-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-51-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-42-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-81-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-87-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-89-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-53-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_main_train/allenai/dolma2-tokenizer_shadow00/ (1.23MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_main_train/allenai/dolma2-tokenizer_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer_shadow00/ (1.51MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/metamath_shadow00/ (84.22MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/metamath_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/ (1.78MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-12-00000.npy + #SOURCE: 
s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/ (2.11MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (8.68BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-47-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/ (3.90BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + #SOURCE: s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/ (1.59BT) + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-096-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-045-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-027-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-160-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-010-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-130-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-057-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-076-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-177-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-113-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-015-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-099-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-005-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-172-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-189-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-056-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-066-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-041-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-094-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-029-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-092-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-173-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-039-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-119-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-020-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-126-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-164-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-108-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-183-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-021-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-014-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-134-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-137-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-033-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-171-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-166-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-081-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-068-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-187-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-071-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-023-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-063-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-006-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-040-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-131-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-153-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-185-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-088-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-178-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-127-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-118-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-098-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-097-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-077-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-146-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-037-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-100-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-079-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-084-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-110-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-078-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-026-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-190-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-042-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-007-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-104-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-019-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-106-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-124-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-132-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-060-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-191-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-139-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-111-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-064-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-054-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-051-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-030-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-120-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-148-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-145-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-091-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-052-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-149-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-074-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-013-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-072-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-138-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-129-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-012-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-046-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-067-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-004-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-174-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-107-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-053-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-140-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-061-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-117-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-155-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-121-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-065-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-016-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-050-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-151-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-162-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-095-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-032-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-181-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-028-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-048-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-080-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-008-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-009-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-179-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-163-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-102-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-034-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-144-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-168-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-090-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-073-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-115-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-147-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-002-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-047-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-169-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-059-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-001-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-049-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-180-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-141-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-167-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-036-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-083-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-024-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-152-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-182-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-186-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-058-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-122-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-017-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-069-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-031-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-156-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-123-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-143-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-175-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-089-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-101-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-136-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-055-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-114-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-125-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-176-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-142-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-070-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-022-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-093-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-157-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-003-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-116-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-135-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-011-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-159-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-038-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-085-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-165-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-018-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-161-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-044-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-086-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-062-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-188-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-154-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-128-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-087-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-035-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-170-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-109-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-133-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-075-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-150-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-043-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-105-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-103-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-158-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-082-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-184-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-112-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-025-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa/tokenized/dolma-merged-qa-mmlu-topics/part-000-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (27.38BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-41-00000.npy \ No newline at end of file diff --git a/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy.yaml b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy.yaml new file mode 100644 index 000000000..b96e3ad3f --- /dev/null +++ b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy.yaml @@ -0,0 +1,754 @@ +run_name: 
peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size 
+ precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: 
s3://ai2-llm/preprocessed/personahub_math_v2_79975/ (84.52MT) + - s3://ai2-llm/preprocessed/personahub_math_v2_79975/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/ (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/ (1.78MT) + - 
s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-16-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-13-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - 
s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (8.65BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-86-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/ (5.71BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-15-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (25.63BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-28-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-28-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-49-00000.npy \ No newline at end of file diff --git a/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_legal-whammy-2.yaml b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_legal-whammy-2.yaml new file mode 100644 index 000000000..6adf679ac --- /dev/null +++ b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_legal-whammy-2.yaml @@ -0,0 +1,843 @@ +run_name: peteish7-weka-anneal-from-928646-50B-nowup_legal-whammy-2 +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: 
true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: 
copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/ (191.58MT) + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-14-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/ (21.80MT) + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/ (19.74MT) + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (8.54BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/ (3.01BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-10-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (24.31BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-2-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-07-00000.npy \ No newline at end of file diff --git a/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_legal-whammy-2_seed42.yaml b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_legal-whammy-2_seed42.yaml new file mode 100644 index 000000000..d15135023 --- /dev/null +++ b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_legal-whammy-2_seed42.yaml @@ -0,0 +1,843 @@ +run_name: peteish7-weka-anneal-from-928646-50B-nowup_legal-whammy-2_seed42 +seed: 42 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + 
attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: 
copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/ (191.58MT) + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-14-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/ (21.80MT) + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/ (19.74MT) + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (8.54BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/ (3.01BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-10-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (24.31BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-2-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-07-00000.npy \ No newline at end of file diff --git a/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_legal-whammy-2_seed42069.yaml b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_legal-whammy-2_seed42069.yaml new file mode 100644 index 000000000..52b971e68 --- /dev/null +++ b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_legal-whammy-2_seed42069.yaml @@ -0,0 +1,843 @@ +run_name: peteish7-weka-anneal-from-928646-50B-nowup_legal-whammy-2_seed42069 +seed: 42069 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + 
bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: 
copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/ (191.58MT) + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-14-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/ (21.80MT) + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/ (19.74MT) + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (8.54BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/ (3.01BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-10-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (24.31BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-2-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-07-00000.npy \ No newline at end of file diff --git a/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_legal-whammy-2_seed666.yaml b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_legal-whammy-2_seed666.yaml new file mode 100644 index 000000000..ce2629772 --- /dev/null +++ b/configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup_legal-whammy-2_seed666.yaml @@ -0,0 +1,843 @@ +run_name: peteish7-weka-anneal-from-928646-50B-nowup_legal-whammy-2_seed666 +seed: 666 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: 
false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: 
copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/ (191.58MT) + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-14-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/ (21.80MT) + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/ (19.74MT) + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-1-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (8.54BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/ (3.01BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-10-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (24.31BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-2-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-07-00000.npy \ No newline at end of file diff --git a/configs/microannealing/peteish7-weka-microanneal-from928646_automathtext.yaml b/configs/microannealing/peteish7-weka-microanneal-from928646_automathtext.yaml new file mode 100644 index 000000000..6697e091f --- /dev/null +++ b/configs/microannealing/peteish7-weka-microanneal-from928646_automathtext.yaml @@ -0,0 +1,298 @@ +run_name: peteish7-weka-microanneal-from928646_automathtext +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + 
attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: 
copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/ (5.23BT) + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-38-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-64-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-48-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-76-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-35-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-47-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-83-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-46-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-61-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-44-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-43-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-63-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-74-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-45-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-53-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-80-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-54-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-32-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-75-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-72-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-86-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-30-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-23-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-26-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (6.07BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-27-00000.npy \ No newline at end of file diff --git a/configs/microannealing/peteish7-weka-microanneal-from928646_bestof-v1.yaml b/configs/microannealing/peteish7-weka-microanneal-from928646_bestof-v1.yaml new file mode 100644 index 000000000..0e12fde76 --- /dev/null +++ b/configs/microannealing/peteish7-weka-microanneal-from928646_bestof-v1.yaml @@ -0,0 +1,636 @@ +run_name: peteish7-weka-microanneal-from928646_bestof-v1 +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 
+save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # 
ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + 
- label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v2_79975/ (84.52MT) + - s3://ai2-llm/preprocessed/personahub_math_v2_79975/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/ (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: 
s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/ (1.78MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-16-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-22-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (7.25BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-51-00000.npy \ No newline at end of file diff --git a/configs/microannealing/peteish7-weka-microanneal-from928646_mathcoder-synthetic.yaml b/configs/microannealing/peteish7-weka-microanneal-from928646_mathcoder-synthetic.yaml new file mode 100644 index 000000000..9e371d0f0 --- /dev/null +++ b/configs/microannealing/peteish7-weka-microanneal-from928646_mathcoder-synthetic.yaml @@ -0,0 +1,362 @@ +run_name: 
peteish7-weka-microanneal-from928646_mathcoder-synthetic +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: 
by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (3.46BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-16-00000.npy \ No newline at end of file diff --git a/configs/microannealing/peteish7-weka-microanneal-from928646_mathpile.yaml 
b/configs/microannealing/peteish7-weka-microanneal-from928646_mathpile.yaml new file mode 100644 index 000000000..f91c731c5 --- /dev/null +++ b/configs/microannealing/peteish7-weka-microanneal-from928646_mathpile.yaml @@ -0,0 +1,313 @@ +run_name: peteish7-weka-microanneal-from928646_mathpile +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + 
+restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + 
num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/ (5.19BT) + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-65-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-33-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-54-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-32-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-37-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-38-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-59-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-46-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-49-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-35-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-44-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-24-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-43-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-34-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-28-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-60-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-30-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-45-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-58-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-50-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-39-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-53-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-29-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-61-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-40-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-51-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-41-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-48-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-64-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-63-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-25-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/mathpile-commercial/part-26-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (4.52BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-34-00000.npy \ No newline at end of file diff --git a/configs/microannealing/peteish7-weka-microanneal-from928646_owm.yaml b/configs/microannealing/peteish7-weka-microanneal-from928646_owm.yaml new file mode 100644 index 000000000..949491830 --- /dev/null +++ b/configs/microannealing/peteish7-weka-microanneal-from928646_owm.yaml @@ -0,0 +1,275 @@ +run_name: peteish7-weka-microanneal-from928646_owm +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + 
truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: 
mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/ (4.80BT) + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/open-web-math/part-09-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (4.58BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-52-00000.npy 
+ - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0025/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-05-00000.npy \ No newline at end of file diff --git a/configs/microannealing/peteish7-weka-microanneal-from928646_pes2o.yaml b/configs/microannealing/peteish7-weka-microanneal-from928646_pes2o.yaml new file mode 100644 index 000000000..b2bb5ecec --- /dev/null +++ b/configs/microannealing/peteish7-weka-microanneal-from928646_pes2o.yaml @@ -0,0 +1,267 @@ +run_name: peteish7-weka-microanneal-from928646_pes2o +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + 
attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: 
all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream 
+ + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/ (6.01BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-00-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (5.68BT) + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-01-00000.npy \ No newline at end of file diff --git a/configs/microannealing/peteish7-weka-microanneal-from928646_reddit-qa-all.yaml b/configs/microannealing/peteish7-weka-microanneal-from928646_reddit-qa-all.yaml 
new file mode 100644 index 000000000..5a0a12385 --- /dev/null +++ b/configs/microannealing/peteish7-weka-microanneal-from928646_reddit-qa-all.yaml @@ -0,0 +1,460 @@ +run_name: peteish7-weka-microanneal-from928646_reddit-qa-all +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# 
stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + 
instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/ (6.42BT) + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-074-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-124-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-096-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-148-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-058-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-179-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-081-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-176-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-128-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-109-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-130-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-150-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-158-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-151-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-023-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-173-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-116-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-000-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-083-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-004-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-091-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-077-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-123-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-002-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-101-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-024-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-090-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-006-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-017-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-067-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-162-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-086-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-172-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-118-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-135-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-010-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-032-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-107-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-106-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-054-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-084-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-007-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-112-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-163-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-031-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-164-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-009-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-088-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-153-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-003-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-070-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-127-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-027-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-142-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-008-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-175-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-139-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-062-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-140-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-170-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-045-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-052-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-100-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-048-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-104-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-075-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-149-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-061-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-028-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-094-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-183-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-012-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-076-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-165-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-143-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-049-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-159-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-160-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-092-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-187-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-016-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-097-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-069-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-099-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-168-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-029-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-014-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-063-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-095-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-191-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-132-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-157-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-155-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-102-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-119-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-156-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-103-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-114-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-026-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-174-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-039-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-188-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-166-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-177-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-043-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-120-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-051-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-108-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-145-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-055-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-154-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-184-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-122-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-161-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-019-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-169-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-134-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-186-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-133-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-190-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-044-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-047-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-001-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-056-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-013-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-182-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-098-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-059-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-057-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-042-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-087-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-011-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-093-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-071-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-131-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-073-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-167-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-136-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-126-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-005-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-125-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-121-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-105-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-034-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-066-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-185-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-015-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-060-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-117-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-041-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-072-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-113-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-046-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-020-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-178-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-110-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-037-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-171-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-189-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-050-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-065-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-082-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-068-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-144-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-053-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-111-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-040-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-089-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-080-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-146-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-085-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-030-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-180-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-129-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-181-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-064-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-022-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-079-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-036-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-035-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-038-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-078-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-033-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-141-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-025-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-137-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-018-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-147-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-021-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-115-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-138-00000.npy + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/merged_versions/merged_qa_all/tokenized/dolma-merged-qa-all-mmlu-topics/part-152-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (6.73BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0011/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-58-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-57-00000.npy \ No newline at end of file diff --git a/configs/microannealing/peteish7-weka-microanneal-from928646_reddit-v1.yaml b/configs/microannealing/peteish7-weka-microanneal-from928646_reddit-v1.yaml new file mode 100644 index 000000000..f94468d3a --- /dev/null +++ b/configs/microannealing/peteish7-weka-microanneal-from928646_reddit-v1.yaml @@ -0,0 +1,346 @@ +run_name: peteish7-weka-microanneal-from928646_reddit-v1 +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 
+save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # 
ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + 
- label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/ (1.59BT) + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-70-00000.npy + - 
s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-77-00000.npy + - 
s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/reddit/dolma_raw/merged_versions/merged_qa/thread-mix-mmlu-topics/dolma2-tokenizer/part-68-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (1.51BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-15-00000.npy \ No newline at end of file diff --git a/configs/microannealing/peteish7-weka-microanneal-from928646_se.yaml b/configs/microannealing/peteish7-weka-microanneal-from928646_se.yaml new file mode 100644 index 000000000..eefc23563 --- /dev/null +++ b/configs/microannealing/peteish7-weka-microanneal-from928646_se.yaml @@ -0,0 +1,270 @@ +run_name: peteish7-weka-microanneal-from928646_se +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: 
true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: 
all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream 
+ + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (1.48BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-21-00000.npy \ No newline at end of file diff --git a/configs/microannealing/peteish7-weka-microanneal-from928646_small-sources-big-dclm.yaml 
b/configs/microannealing/peteish7-weka-microanneal-from928646_small-sources-big-dclm.yaml new file mode 100644 index 000000000..0c6daf68f --- /dev/null +++ b/configs/microannealing/peteish7-weka-microanneal-from928646_small-sources-big-dclm.yaml @@ -0,0 +1,424 @@ +run_name: peteish7-weka-microanneal-from928646_small-sources-big-dclm +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: 
/weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # 
- /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + 
num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v2_79975/ (84.52MT) + - s3://ai2-llm/preprocessed/personahub_math_v2_79975/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/ (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: 
s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/ (1.78MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-16-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-17-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/ (2.11MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd2/dolma2-tokenizer/part-3-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (1.52BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-26-00000.npy \ No newline at end of file diff --git a/configs/microannealing/peteish7-weka-microanneal-from928646_small-sources-multiadd.yaml b/configs/microannealing/peteish7-weka-microanneal-from928646_small-sources-multiadd.yaml new file mode 100644 index 000000000..5e372184a --- /dev/null +++ b/configs/microannealing/peteish7-weka-microanneal-from928646_small-sources-multiadd.yaml @@ -0,0 +1,421 @@ +run_name: peteish7-weka-microanneal-from928646_small-sources-multiadd +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + 
attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: 
all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream 
+ + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v2_79975/ (84.52MT) + - s3://ai2-llm/preprocessed/personahub_math_v2_79975/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/ (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/ (1.78MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-16-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-08-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (374.43MT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-07-00000.npy \ No newline at end of file diff --git a/configs/microannealing/peteish7-weka-microanneal-from928646_small-sources.yaml b/configs/microannealing/peteish7-weka-microanneal-from928646_small-sources.yaml new file mode 100644 index 000000000..efca8f91e --- /dev/null +++ b/configs/microannealing/peteish7-weka-microanneal-from928646_small-sources.yaml @@ -0,0 +1,414 @@ +run_name: peteish7-weka-microanneal-from928646_small-sources +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: 
/weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: 
mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v2_79975/ (84.52MT) + - s3://ai2-llm/preprocessed/personahub_math_v2_79975/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/ (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + 
#SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/ (1.78MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-16-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-14-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (364.02MT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-25-00000.npy \ No newline at end of file diff --git a/configs/microannealing/peteish7-weka-microanneal-from928646_small-sources_x2.yaml b/configs/microannealing/peteish7-weka-microanneal-from928646_small-sources_x2.yaml new file mode 100644 index 000000000..d52fc8437 --- /dev/null +++ b/configs/microannealing/peteish7-weka-microanneal-from928646_small-sources_x2.yaml @@ -0,0 +1,578 @@ +run_name: peteish7-weka-microanneal-from928646_small-sources_x2 +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + 
init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - 
label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v2_79975/ (84.52MT) + - s3://ai2-llm/preprocessed/personahub_math_v2_79975/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/ (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/ (1.78MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-16-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-14-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-18-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-05-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/personahub_math_v2_79975_shadow00/ (84.52MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/personahub_math_v2_79975_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/ (9.03MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-23-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-30-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-34-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-29-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-33-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-26-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-25-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-28-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-24-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-32-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-31-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-35-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-27-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-10-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k-synth/resample_v1_6x/dolma2-tokenizer_shadow00/ (1.08MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k-synth/resample_v1_6x/dolma2-tokenizer_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/ (17.06MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-50-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-18-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-41-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-60-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-55-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-87-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-63-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-29-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-34-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-88-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-36-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-54-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-83-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-61-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-44-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-91-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-64-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-39-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-32-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-72-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-77-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-90-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-79-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-27-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-23-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-89-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-62-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-47-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-52-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-24-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-31-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-65-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-25-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-38-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-51-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-84-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-48-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-66-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-26-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-76-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-37-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-85-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-58-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-81-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-43-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-78-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-53-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-70-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-33-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-73-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-69-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-45-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-74-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-59-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-67-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-86-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-42-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-75-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-82-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-28-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-46-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-30-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-68-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-56-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-57-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-49-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-71-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-16-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_main_train/allenai/dolma2-tokenizer_shadow00/ (1.23MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_main_train/allenai/dolma2-tokenizer_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer_shadow00/ (1.51MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/metamath_shadow00/ (84.22MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/metamath_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/ (1.78MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-22-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (393.66MT) + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-26-00000.npy \ No newline at end of file diff --git a/configs/microannealing/peteish7-weka-microanneal-from928646_small-sources_x4.yaml b/configs/microannealing/peteish7-weka-microanneal-from928646_small-sources_x4.yaml new file mode 100644 index 000000000..6a280e04e --- /dev/null +++ b/configs/microannealing/peteish7-weka-microanneal-from928646_small-sources_x4.yaml @@ -0,0 +1,907 @@ +run_name: peteish7-weka-microanneal-from928646_small-sources_x4 +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 
+save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + 
type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/personahub_math_v2_79975/ (84.52MT) + - s3://ai2-llm/preprocessed/personahub_math_v2_79975/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/ (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + #SOURCE: 
s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/ (1.78MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-13-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-19-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-00-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-18-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-21-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-01-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-07-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-06-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-11-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-04-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-03-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-12-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-08-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-15-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-17-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-05-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-20-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-09-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-10-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-02-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-22-00000.npy + - s3://ai2-llm/preprocessed/owm-filtered-math/codesearchnet/part-16-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/personahub_math_v2_79975_shadow00/ (84.52MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/personahub_math_v2_79975_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/personahub_math_v2_79975_shadow01/ (84.52MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/personahub_math_v2_79975_shadow01/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/personahub_math_v2_79975_shadow02/ (84.52MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/personahub_math_v2_79975_shadow02/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/ (9.03MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-26-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-31-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-30-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-27-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-23-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-32-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-28-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-29-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-25-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-34-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-24-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-35-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-33-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow00/part-01-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/ (9.03MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-25-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-13-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-33-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-35-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-34-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-23-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-26-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-28-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-29-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-24-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-30-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-32-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-31-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-27-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow01/part-01-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/ (9.03MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-32-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-34-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-28-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-25-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-24-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-35-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-30-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-29-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-23-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-27-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-26-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-33-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/basic_math_mj/dolma2-tokenizer_shadow02/part-31-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k-synth/resample_v1_6x/dolma2-tokenizer_shadow00/ (1.08MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k-synth/resample_v1_6x/dolma2-tokenizer_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k-synth/resample_v1_6x/dolma2-tokenizer_shadow01/ (1.08MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k-synth/resample_v1_6x/dolma2-tokenizer_shadow01/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k-synth/resample_v1_6x/dolma2-tokenizer_shadow02/ (1.08MT) + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k-synth/resample_v1_6x/dolma2-tokenizer_shadow02/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/ (17.06MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-34-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-37-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-46-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-76-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-66-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-31-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-48-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-38-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-88-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-87-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-57-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-23-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-41-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-91-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-80-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-85-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-62-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-32-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-86-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-60-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-58-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-90-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-72-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-50-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-36-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-71-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-42-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-35-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-47-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-82-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-68-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-70-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-55-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-61-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-63-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-56-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-84-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-75-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-43-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-79-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-27-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-81-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-74-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-77-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-29-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-24-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-40-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-45-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-52-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-59-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-28-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-51-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-89-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-67-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-83-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-44-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-25-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-53-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-26-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-49-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-65-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-30-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-64-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-73-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-69-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-39-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow00/part-78-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/ (17.06MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-60-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-59-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-24-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-89-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-39-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-42-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-62-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-36-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-56-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-72-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-55-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-84-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-35-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-79-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-88-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-40-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-71-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-63-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-70-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-32-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-74-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-90-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-73-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-86-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-27-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-66-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-53-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-69-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-85-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-61-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-76-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-37-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-49-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-23-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-47-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-34-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-58-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-33-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-80-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-52-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-57-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-64-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-48-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-87-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-46-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-82-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-38-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-51-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-77-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-50-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-41-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-31-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-43-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-65-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-44-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-54-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-81-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-29-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-75-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-68-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-83-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-78-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-30-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-28-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-67-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-25-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-26-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow01/part-03-00000.npy + #SOURCE: 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/ (17.06MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-68-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-32-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-25-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-72-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-31-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-49-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-44-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-55-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-50-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-78-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-76-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-29-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-85-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-47-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-26-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-70-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-35-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-52-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-91-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-42-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-24-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-82-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-89-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-60-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-36-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-27-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-83-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-40-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-81-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-79-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-38-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-69-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-73-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-90-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-45-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-61-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-41-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-39-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-33-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-80-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-23-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-75-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-30-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-67-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-74-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-28-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-84-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-58-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-86-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-51-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-88-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-59-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-64-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-56-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-46-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-71-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-48-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-77-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-43-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-87-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-65-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-37-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-34-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm_MIND/clean_stop/dolma2-tokenizer_shadow02/part-53-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_main_train/allenai/dolma2-tokenizer_shadow00/ (1.23MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_main_train/allenai/dolma2-tokenizer_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_main_train/allenai/dolma2-tokenizer_shadow01/ (1.23MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_main_train/allenai/dolma2-tokenizer_shadow01/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_main_train/allenai/dolma2-tokenizer_shadow02/ (1.23MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_main_train/allenai/dolma2-tokenizer_shadow02/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer_shadow00/ (1.51MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer_shadow01/ (1.51MT) + - 
s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer_shadow01/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer_shadow02/ (1.51MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer_shadow02/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/metamath_shadow00/ (84.22MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/metamath_shadow00/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/metamath_shadow01/ (84.22MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/metamath_shadow01/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/metamath_shadow02/ (84.22MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/metamath_shadow02/part-0-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/ (1.78MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow00/part-22-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/ (1.78MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow01/part-09-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/ (1.78MT) + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-00-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-05-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/owm-filtered-math/codesearchnet_shadow02/part-08-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (764.66MT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-55-00000.npy \ No newline at end of file diff --git a/configs/microannealing/peteish7-weka-microanneal-from928646_tiny-gsm-inline.yaml b/configs/microannealing/peteish7-weka-microanneal-from928646_tiny-gsm-inline.yaml new file mode 100644 index 000000000..2280614b7 --- /dev/null +++ b/configs/microannealing/peteish7-weka-microanneal-from928646_tiny-gsm-inline.yaml @@ -0,0 +1,346 @@ +run_name: peteish7-weka-microanneal-from928646_tiny-gsm-inline +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: 
true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: 
all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream 
+ + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/ (1.65BT) + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/inline_comments/dolma2-tokenizer/part-80-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (1.52BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0021/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-61-00000.npy \ No newline at end of file diff --git a/configs/microannealing/peteish7-weka-microanneal-from928646_tiny-gsm-mind.yaml b/configs/microannealing/peteish7-weka-microanneal-from928646_tiny-gsm-mind.yaml new file mode 100644 index 000000000..e4caee905 --- /dev/null +++ b/configs/microannealing/peteish7-weka-microanneal-from928646_tiny-gsm-mind.yaml @@ -0,0 +1,351 @@ +run_name: peteish7-weka-microanneal-from928646_tiny-gsm-mind +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + 
+optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - 
label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (3.34BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-57-00000.npy \ No newline at end of file diff --git a/configs/microannealing/peteish7-weka-microanneal-from928646_tiny-gsm-mind_x2.yaml b/configs/microannealing/peteish7-weka-microanneal-from928646_tiny-gsm-mind_x2.yaml new file mode 100644 index 000000000..7d24ad30a --- /dev/null +++ b/configs/microannealing/peteish7-weka-microanneal-from928646_tiny-gsm-mind_x2.yaml @@ -0,0 +1,452 @@ +run_name: peteish7-weka-microanneal-from928646_tiny-gsm-mind_x2 +seed: 7201 
+dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000061499 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: 
one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy 
+ + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/ (3.06BT) + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-22-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-62-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-43-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-12-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-72-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-37-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-53-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-78-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-46-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-24-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-04-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-40-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-56-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-39-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-01-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-76-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-42-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-18-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-32-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-60-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-15-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-81-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-70-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-89-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-74-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-65-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-29-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-49-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-26-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-31-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-88-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-09-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-45-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-38-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-64-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-68-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-71-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-47-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-82-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-66-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-10-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-08-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-11-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-06-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-79-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-73-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-07-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-30-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-80-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-16-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-61-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-27-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-44-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-02-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-34-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-54-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-55-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-03-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-23-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-14-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-52-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-69-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-13-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-90-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-33-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-67-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-58-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-75-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-77-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-51-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-59-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-36-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-19-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-83-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-21-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-25-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-87-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-41-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-84-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-63-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-17-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-85-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-50-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-20-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-28-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-57-00000.npy + - s3://ai2-llm/preprocessed/math_shadow_clones/tinyGSM/mind/dolma2-tokenizer_shadow00/part-48-00000.npy + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (6.46BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-17-00000.npy \ No newline at end of file diff --git a/configs/microannealing/peteish7-weka-microanneal-fromMegaMath1.1-5000_nonmath.yaml 
b/configs/microannealing/peteish7-weka-microanneal-fromMegaMath1.1-5000_nonmath.yaml new file mode 100644 index 000000000..32fc7f43f --- /dev/null +++ b/configs/microannealing/peteish7-weka-microanneal-fromMegaMath1.1-5000_nonmath.yaml @@ -0,0 +1,263 @@ +run_name: peteish7-weka-microanneal-fromMegaMath1.1-5000_nonmath +seed: 7201 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 0.000035726 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: 
/weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-weka-anneal-from-928646-50B-megamath_v1.1.yaml/step5000/ + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: 
natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + #SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (5.32BT) + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0019/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0012/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-14-00000.npy + - 
#!/usr/bin/env bash

# Launcher for peteish7 microannealing runs on Jupiter.
#
# Usage: launch_microannealing_peteish7.sh CONFIG_NAME NUM_NODES PRIORITY
#   CONFIG_NAME  basename (without .yaml) of a config under configs/microannealing
#   NUM_NODES    number of replicas to request (8 GPUs each)
#   PRIORITY     value forwarded to `gantry run --priority`

set -ex

# Fail fast with a usage message when an argument is missing instead of
# handing empty values to gantry.
USAGE="usage: $0 CONFIG_NAME NUM_NODES PRIORITY"
CONFIG_NAME=${1:?$USAGE}
NUM_NODES=${2:?$USAGE}
PRIORITY=${3:?$USAGE}

CONFIG_DIR=configs/microannealing
CONFIG_PATH=${CONFIG_DIR}/${CONFIG_NAME}.yaml

# All expansions are quoted so a value containing whitespace or glob
# characters is passed to gantry as a single argument.
gantry run \
  --workspace ai2/davidw-oe-annealing \
  --task-name "${CONFIG_NAME}" \
  --description "${CONFIG_NAME}" \
  --priority "${PRIORITY}" \
  --preemptible \
  --beaker-image petew/olmo-torch23-gantry \
  --cluster ai2/jupiter-cirrascale-2 \
  --gpus 8 \
  --replicas "${NUM_NODES}" \
  --leader-selection \
  --host-networking \
  --budget ai2/oe-training \
  --no-nfs \
  --weka oe-training-default:/weka/oe-training-default \
  --propagate-failure \
  --propagate-preemption \
  --synchronized-start-timeout 90m \
  --no-python \
  --env LOG_FILTER_TYPE=local_rank0_only \
  --env OMP_NUM_THREADS=8 \
  --env OLMO_TASK=model \
  --env R2_PROFILE=R2 \
  --env S3_PROFILE=S3 \
  --env WEKA_PROFILE=WEKA \
  --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \
  --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \
  --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \
  --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \
  --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \
  --shared-memory 10GiB \
  --yes \
  --timeout=-1 \
  -- /bin/bash -c "scripts/beaker/peteish/peteish7.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK ${CONFIG_PATH}"
def get_single_s3_size(s3_uri: str, s3_client=None) -> int:
    """Return the size in bytes of the object at ``s3_uri``.

    Args:
        s3_uri: Object URI of the form ``s3://bucket/key``.
        s3_client: Optional S3 client to reuse. A new ``boto3`` client is
            created only when this is ``None``.

    Returns:
        The ``ContentLength`` reported by a HEAD request on the object.

    Raises:
        FileNotFoundError: If the request fails with error code ``"404"``.
        Exception: Any other error from the client is re-raised unchanged.
    """
    parsed = urlparse(s3_uri)
    bucket_name = parsed.netloc
    # The URL path starts with "/", which is not part of the object key.
    object_key = parsed.path.lstrip("/")

    # Honour a caller-supplied client so batch callers can share one client
    # instead of constructing a new one for every object.
    if s3_client is None:
        s3_client = boto3.client("s3")

    try:
        response = s3_client.head_object(Bucket=bucket_name, Key=object_key)
        return response["ContentLength"]
    except Exception as e:
        if hasattr(e, "response") and e.response["Error"]["Code"] == "404":
            raise FileNotFoundError(
                f"The object {object_key} does not exist in bucket {bucket_name}"
            ) from e
        raise


def get_batch_s3_size(s3_uris: List[str]) -> MutableMapping[str, int]:
    """Fetch the sizes of many S3 objects using a small thread pool.

    Args:
        s3_uris: Object URIs of the form ``s3://bucket/key``.

    Returns:
        A dict mapping each URI to its size in bytes.

    Raises:
        FileNotFoundError: If any object is reported missing; other client
            errors propagate from the first failing lookup.
    """
    # One client is created here and handed to every lookup.
    s3_client = boto3.client("s3")

    def partial_size(s3_uri: str) -> Tuple[str, int]:
        return s3_uri, get_single_s3_size(s3_uri, s3_client=s3_client)

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(partial_size, uri) for uri in s3_uris]
        # Results are collected in submission order; .result() re-raises any
        # exception raised inside the worker.
        results = [future.result() for future in tqdm(futures, total=len(futures))]

    return dict(results)


def list_s3_paths(s3_uri: str, extension: str = ".npy", s3_client=None) -> List[Tuple[str, int]]:
    """List the objects under an S3 prefix that end with ``extension``.

    Args:
        s3_uri: Prefix URI of the form ``s3://bucket/prefix``. A trailing
            ``/`` is appended to a non-empty prefix so it is matched as a
            directory.
        extension: Suffix to keep, e.g. ``".npy"``. A leading ``.`` is added
            if it is missing.
        s3_client: Optional S3 client to reuse. A new ``boto3`` client is
            created only when this is ``None``.

    Returns:
        A list of ``(key, size_in_bytes)`` tuples. Keys are relative to the
        bucket (no ``s3://bucket/`` prefix). If listing fails, the error is
        printed and an empty list is returned; partial results are discarded.
    """
    parsed = urlparse(s3_uri)
    bucket_name = parsed.netloc

    # The URL path starts with "/", which is not part of the key prefix.
    prefix = parsed.path.lstrip("/")

    if s3_client is None:
        s3_client = boto3.client("s3")

    # Ensure prefix ends with '/' if it's meant to be a directory
    if prefix and not prefix.endswith("/"):
        prefix += "/"

    # Ensure extension starts with '.'
    if not extension.startswith("."):
        extension = "." + extension

    paths_and_sizes = []
    paginator = s3_client.get_paginator("list_objects_v2")

    try:
        # Paginate through results to handle large buckets
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
            # Pages without a "Contents" entry carry no objects.
            for obj in page.get("Contents", ()):
                key = obj["Key"]
                if key.endswith(extension):
                    paths_and_sizes.append((key, obj["Size"]))

        return paths_and_sizes

    except Exception as e:
        # Best-effort: report the problem and let the caller see "no files".
        print(f"Error listing objects: {str(e)}")
        return []
embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: REPLACE_LR_HERE + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: linear_with_warmup + t_warmup: 0 + alpha_f: 0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: REPLACE_PATH_HERE + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 1ep +# stop_at: 11931 # Relying on max_duration for anneals +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # 
def human_format_number(num, decimal_places=2):
    """
    Format a number using K for thousands, M for millions, B for billions, T for trillions.

    Values whose magnitude is below 1000 are returned unformatted. Anything
    beyond the trillions range keeps the ``T`` suffix.

    Args:
        num: Number to format
        decimal_places: Number of decimal places to show (default: 2)

    Returns:
        The formatted string, prefixed with ``-`` for negative input.

    Examples:
        human_format_number(999) => '999'
        human_format_number(1000) => '1.00K'
        human_format_number(1500) => '1.50K'
        human_format_number(999999) => '1.00M'
        human_format_number(1000000) => '1.00M'
        human_format_number(1500000000) => '1.50B'
    """
    abs_num = abs(num)
    sign = "-" if num < 0 else ""

    if abs_num < 1000:
        return f"{sign}{abs_num}"

    suffixes = ["", "K", "M", "B", "T"]
    last_magnitude = len(suffixes) - 1
    magnitude = 0

    while abs_num >= 1000 and magnitude < last_magnitude:
        abs_num /= 1000
        magnitude += 1

    # Format with specified decimal places
    formatted = f"{abs_num:.{decimal_places}f}"

    # Rounding can carry the mantissa up to 1000 (e.g. 999.999 -> "1000.00");
    # when a larger suffix exists, move up one so 999_999 reads "1.00M"
    # rather than "1000.00K".
    if magnitude < last_magnitude and float(formatted) >= 1000:
        magnitude += 1
        formatted = f"{abs_num / 1000:.{decimal_places}f}"

    return f"{sign}{formatted}{suffixes[magnitude]}"


def get_token_strs(token_source, bytes_per_token=4):
    """Build the ``paths`` entries for one token source.

    The files under the source prefix are shuffled and taken one by one until
    the requested share of the source's tokens has been reached, so at least
    one file is selected whenever the listing is non-empty.

    Args:
        token_source: Either an S3 prefix URI (use the whole source) or a
            ``(s3_uri, ratio)`` pair, where ``ratio`` is the fraction of the
            source's tokens to include.
        bytes_per_token: Bytes per stored token, used to convert file sizes
            into token counts (default: 4).

    Returns:
        A list of strings: a ``#SOURCE: <uri> (<tokens>T)`` header followed by
        one ``- s3://bucket/key`` line per selected file.
    """
    if isinstance(token_source, str):
        s3_source, ratio = token_source, 1.0
    else:
        s3_source, ratio = token_source

    # list_s3_paths returns bucket-relative keys; turn them back into URIs.
    bucket_name = urlparse(s3_source).netloc
    paths_and_sizes = [("s3://%s/%s" % (bucket_name, key), size) for key, size in list_s3_paths(s3_source)]
    random.shuffle(paths_and_sizes)

    total_tokens = sum(size for _, size in paths_and_sizes) // bytes_per_token
    target_tokens = total_tokens * ratio

    paths_to_add = []
    tokens_to_add = 0
    for path, size in paths_and_sizes:
        paths_to_add.append(path)
        tokens_to_add += size // bytes_per_token
        if tokens_to_add >= target_tokens:
            break

    lines_to_add = ["#SOURCE: %s (%sT)" % (s3_source, human_format_number(tokens_to_add))]
    lines_to_add.extend("- %s" % path for path in paths_to_add)
    return lines_to_add
s3_uri: str | (s3_uri: str, fraction: float) + # Also I'm not bothering with pyyaml, just appending to the base config (which will be included) + # ^this is a very crude stone-age tool, don't @ me + + assert os.path.basename(output_yaml_file).startswith("peteish7-weka-microanneal") + assert output_yaml_file.endswith(".yaml") + + assert start_point in ["preanneal", "megamath5000"] + base_config_str = BASE_YAML_STR.replace( + "REPLACE_RUN_NAME_HERE", os.path.splitext(os.path.basename(output_yaml_file))[0] + ) + + # Change input model, LR + if start_point == "preanneal": + base_config_str = base_config_str.replace( + "REPLACE_PATH_HERE", "/weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646" + ) + base_config_str = base_config_str.replace("REPLACE_LR_HERE", "0.000061499") + elif start_point == "megamath5000": + base_config_str = base_config_str.replace( + "REPLACE_PATH_HERE", + "/weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-weka-anneal-from-928646-50B-megamath_v1.1.yaml/step5000/", + ) + new_lr = "%.09f" % (0.000061499 * (1 - (5000 / 11931))) + base_config_str = base_config_str.replace("REPLACE_LR_HERE", new_lr) + + lines_to_add = [] + for source in token_sources: + lines_to_add.extend(get_token_strs(source)) + true_lines_to_add = ["\n %s" % line for line in lines_to_add] + output_str = base_config_str + "".join(true_lines_to_add) + with open(output_yaml_file, "w") as f: + f.write(output_str) + + +def examine_config(yaml_file, bytes_per_token=4): + """ + Groups the token sources by their dirname and computes sizes and how much data was taken total. 
+ Prints out rows of: + (token source, total_tokens, percentage_taken, tokens_taken) + """ + + print("Getting tokens per input file...") + # Step 1: collect all paths of tokens + with open(yaml_file, "r") as f: + yaml_content = yaml.safe_load(f) + paths = yaml_content.get("data", {}).get("paths", []) + paths_to_tokens = {k: v // bytes_per_token for k, v in get_batch_s3_size(paths).items()} + + # Step 2: Gather all sources, count tokens taken + print("Grouping output files into groups...") + groups = set(_read_path_comments(yaml_file)) + + def get_group(s3_uri): + for g in groups: + if s3_uri.startswith(g): + return g + raise Exception("UNKNOWN GROUP FOR %s" % s3_uri) + + tokens_taken: MutableMapping[str, int] = defaultdict(int) + for p, tok in paths_to_tokens.items(): + tokens_taken[get_group(p)] += tok + + # Step 3: count total tokens per group + print("Getting total group sizes...") + total_tokens = {} + for g in tqdm(groups): + paths_and_sizes = list_s3_paths(g) + total_tokens[g] = sum(_[1] for _ in paths_and_sizes) // bytes_per_token + print("TOTAL_TOKENS", total_tokens) + # Step 4: get ratios of percentage taken + ratios = { + g: "%.04f" % (tokens_taken[g] / total_tokens[g]) for g in groups + } # .04f here (ranging from 0.00 to 1.00) + + # Step 5: actually print the outputs + rows = sorted([(g, total_tokens[g], ratios[g], tokens_taken[g]) for g in groups]) + print("Put this in your spreadsheet!") + print(tabulate(rows, headers=["paths", "total_tokens", "percentage taken", "tokens taken"])) + + +def _read_path_comments(yaml_file): + # This is helpful for examining paths + lines = open(yaml_file, "r").readlines() + path_sources = [] + seen_paths = False + for line in lines: + if not seen_paths and line.strip() != "paths:": + continue + elif line.strip() == "paths:": + seen_paths = True + elif line.strip().startswith("#"): + path_sources.append(line.strip().split(" ")[1]) + else: + pass + return path_sources + + +# 
def get_single_s3_size(s3_uri: str, s3_client=None) -> int:
    """
    Get the size in bytes of an individual S3 object.

    Args:
        s3_uri: URI of the object, e.g. ``s3://bucket/some/key.npy``.
        s3_client: Client used for the ``head_object`` call. When omitted, a
            new ``boto3`` S3 client is created for this call.

    Returns:
        The ``ContentLength`` reported for the object.

    Raises:
        FileNotFoundError: If the lookup fails with error code ``"404"``.
        Exception: Any other failure of the lookup is re-raised unchanged.
    """
    if s3_client is None:
        # The default used to be dereferenced as-is and failed on None.
        s3_client = boto3.client("s3")

    parsed = urlparse(s3_uri)
    bucket_name = parsed.netloc
    # Remove leading slash and handle edge cases
    object_key = parsed.path.lstrip("/")

    try:
        response = s3_client.head_object(Bucket=bucket_name, Key=object_key)
        return response["ContentLength"]
    except Exception as e:
        error_info = getattr(e, "response", None)
        if error_info is not None and error_info["Error"]["Code"] == "404":
            raise FileNotFoundError(f"The object {object_key} does not exist in bucket {bucket_name}.") from e
        raise


def get_batch_s3_size(s3_uris: List[str]) -> Dict[str, int]:
    """
    Faster way to get the size in bytes for a lot of S3 paths.

    The lookups share one S3 client and run on a pool of 10 worker threads;
    a progress bar advances as the results are collected in submission order.

    Args:
        s3_uris: URIs of the objects to measure.

    Returns:
        Mapping of S3 URI to size in bytes. A URI that appears several times in
        ``s3_uris`` maps to the sum of its individual lookups.
    """
    s3_client = boto3.client("s3")

    def partial_size(s3_uri: str):
        return s3_uri, get_single_s3_size(s3_uri, s3_client=s3_client)

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(partial_size, uri) for uri in s3_uris]
        results = [future.result() for future in tqdm(futures, total=len(futures))]

    # Accumulate rather than overwrite, so repeated URIs are counted each time.
    sizes: Dict[str, int] = {}
    for s3_uri, size in results:
        sizes[s3_uri] = sizes.get(s3_uri, 0) + size
    return sizes


def list_s3_paths(s3_uri: str, extension: str = ".npy") -> List[Tuple[str, int]]:
    """
    Lists all objects under an S3 prefix that have the given extension, along with their sizes.

    Args:
        s3_uri (str): ``s3://bucket/prefix`` location to list. A trailing ``/``
            is appended to a non-empty prefix that lacks one, so the prefix is
            treated as a directory.
        extension (str): File extension to filter on (e.g., '.npy'); a leading
            '.' is added when missing.

    Returns:
        List[Tuple[str, int]]: List of tuples containing (object key, size in bytes).
        The keys are relative to the bucket, not full ``s3://`` URIs. If listing
        fails, the error is printed and an empty list is returned, discarding any
        results gathered so far.
    """
    parsed = urlparse(s3_uri)
    bucket_name = parsed.netloc

    # Remove leading slash and handle edge cases
    prefix = parsed.path.lstrip("/")

    s3_client = boto3.client("s3")

    # Ensure prefix ends with '/' if it's meant to be a directory
    if prefix and not prefix.endswith("/"):
        prefix += "/"

    # Ensure extension starts with '.'
    if not extension.startswith("."):
        extension = "." + extension

    paths_and_sizes = []
    paginator = s3_client.get_paginator("list_objects_v2")

    try:
        # Paginate through results to handle large buckets
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
            # Pages without a "Contents" entry carry no objects.
            for obj in page.get("Contents", []):
                key = obj["Key"]
                if key.endswith(extension):
                    paths_and_sizes.append((key, obj["Size"]))

        return paths_and_sizes

    except Exception as e:
        print(f"Error listing objects: {str(e)}")
        return []
right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646 + +restore_dataloader: false +no_pre_train_checkpoint: true + +max_duration: 50e9T +stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +activation_checkpointing: one_in_four + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: 
mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths:""" + + +def human_format_number(num, decimal_places=2): + """ + Format a number using K for thousands, M for millions, B for billions, T for trillions. + + Args: + num: Number to format + decimal_places: Number of decimal places to show (default: 2) + + Examples: + format_number(999) => '999' + format_number(1000) => '1.00K' + format_number(1500) => '1.50K' + format_number(1000000) => '1.00M' + format_number(1500000000) => '1.50B' + """ + abs_num = abs(num) + sign = "-" if num < 0 else "" + + if abs_num < 1000: + return f"{sign}{abs_num}" + + suffixes = ["", "K", "M", "B", "T"] + magnitude = 0 + + while abs_num >= 1000 and magnitude < len(suffixes) - 1: + abs_num /= 1000 + magnitude += 1 + + # Format with specified decimal places + formatted = f"{abs_num:.{decimal_places}f}" + + return f"{sign}{formatted}{suffixes[magnitude]}" + + +def get_token_strs(token_source, bytes_per_token=4): + if isinstance(token_source, str): + s3_source = token_source + ratio = 1.0 + else: + s3_source, ratio = token_source + + paths_and_sizes = list_s3_paths(s3_source) + parsed = urlparse(s3_source) + bucket_name = parsed.netloc + paths_and_sizes = [("s3://%s/%s" % (bucket_name, p), s) for p, s in paths_and_sizes] + random.shuffle(paths_and_sizes) + total_tokens = sum(_[1] for _ in paths_and_sizes) // bytes_per_token + 
target_tokens = total_tokens * ratio + + paths_to_add = [] + tokens_to_add = 0 + for p, s in paths_and_sizes: + paths_to_add.append(p) + tokens_to_add += s // bytes_per_token + if tokens_to_add >= target_tokens: + break + lines_to_add = ["#SOURCE: %s (%sT)" % (s3_source, human_format_number(tokens_to_add))] + for p in paths_to_add: + lines_to_add.append("- %s" % p) + return lines_to_add + + +def add_paths(token_sources, output_yaml_file): + # Adds things to the yaml file. + # Token sources is a list of either... s3_uri: str | (s3_uri: str, fraction: float) + # Also I'm not bothering with pyyaml, just appending to the base config (which will be included) + # ^this is a very crude stone-age tool, don't @ me + assert output_yaml_file.startswith("peteish7-weka-anneal-from-928646-50B-") + assert output_yaml_file.endswith(".yaml") + + base_config_str = BASE_YAML_STR.replace( + "REPLACE_RUN_NAME_HERE", os.path.splitext(os.path.basename(output_yaml_file))[0] + ) + + lines_to_add = [] + for source in token_sources: + lines_to_add.extend(get_token_strs(source)) + true_lines_to_add = ["\n %s" % line for line in lines_to_add] + output_str = base_config_str + "".join(true_lines_to_add) + with open(output_yaml_file, "w") as f: + f.write(output_str) + + +def examine_config(yaml_file, bytes_per_token=4): + """ + Groups the token sources by their dirname and computes sizes and how much data was taken total. 
+ Prints out rows of: + (token source, total_tokens, percentage_taken, tokens_taken) + """ + + print("Getting tokens per input file...") + # Step 1: collect all paths of tokens + with open(yaml_file, "r") as f: + yaml_content = yaml.safe_load(f) + paths = yaml_content.get("data", {}).get("paths", []) + paths_to_tokens = {k: v // bytes_per_token for k, v in get_batch_s3_size(paths).items()} + + # Step 2: Gather all sources, count tokens taken + print("Grouping output files into groups...") + groups = set(_read_path_comments(yaml_file)) + + def get_group(s3_uri): + for g in groups: + if s3_uri.startswith(g): + return g + raise Exception("UNKNOWN GROUP FOR %s" % s3_uri) + + tokens_taken: MutableMapping[str, int] = defaultdict(int) + for p, tok in paths_to_tokens.items(): + tokens_taken[get_group(p)] += tok + + # Step 3: count total tokens per group + print("Getting total group sizes...") + total_tokens = {} + for g in tqdm(groups): + paths_and_sizes = list_s3_paths(g) + total_tokens[g] = sum(_[1] for _ in paths_and_sizes) // bytes_per_token + # Step 4: get ratios of percentage taken + ratios = { + g: "%.04f" % (tokens_taken[g] / total_tokens[g]) for g in groups + } # .04f here (ranging from 0.00 to 1.00) + + # Step 5: actually print the outputs + rows = sorted([(g, total_tokens[g], ratios[g], tokens_taken[g]) for g in groups]) + print("Put this in your spreadsheet!") + print(tabulate(rows, headers=["paths", "total_tokens", "percentage taken", "tokens taken"])) + + +def _read_path_comments(yaml_file): + # This is helpful for examining paths + lines = open(yaml_file, "r").readlines() + path_sources = [] + seen_paths = False + for line in lines: + if not seen_paths and line.strip() != "paths:": + continue + elif line.strip() == "paths:": + seen_paths = True + elif line.strip().startswith("#"): + path_sources.append(line.strip().split(" ")[1]) + else: + pass + return path_sources + + +# ================================================= +# = MAIN = +# 
================================================= + + +if __name__ == "__main__": + """ + Use this interactively like `python -i peteish7_config_maker.py`, since tuples are weird to pass + [ or load all these modules into a jupyter notebook ] + Usage example: + MATH_TOKENS = ['s3://ai2-llm/preprocessed/personahub_math_v2_79975/', # uses 100% of this dataset + 's3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer', # uses 100% of this dataset + ('s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/allenai/dolma2-tokenizer/', 0.5) # uses 50% of this dataset + ] + OUTPUT_YAML = 'peteish7-weka-anneal-from-928646-50B-test_math.yaml' + add_paths(MATH_TOKENS, OUTPUT_YAML) + + # and then you can populate the spreadsheet with the output of examine_config + print(examine_config(OUTPUT_YAML)) + """