From 36ba37a88fc0adb14a0d72aa5c37073687c6172b Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Mon, 25 Nov 2024 17:53:30 -0800 Subject: [PATCH] Removing non-peteish configs --- configs/amberish1-weka.yaml | 1297 ----- configs/amberish13-weka.yaml | 1293 ----- configs/amberish7-weka.yaml | 1293 ----- configs/amberish70-weka.yaml | 1294 ----- configs/annealing/OLMo-7B.yaml | 206 - .../amberish7-anneal-from477850-50B.yaml | 381 -- ...chish7-anneal-from477000-50B-flan_fix.yaml | 326 -- .../mitchish7-anneal-from616350-50B.yaml | 388 -- ...tafix-anneal-from639650-50B-warmup-lr.yaml | 387 -- ...tchish7-datafix-anneal-from639650-50B.yaml | 379 -- .../annealing/olmo70b-from160510-100B.yaml | 377 -- .../annealing/olmo70b-from205000-100B.yaml | 377 -- .../annealing/olmo70b-from205000-150B.yaml | 489 -- .../annealing/olmo70b-from205000-300B.yaml | 526 -- .../olmo70b-resume_optimizer-steps_50B.yaml | 374 -- .../v0-step_1.5T-superweb25-warmup_true.yaml | 393 -- .../v0-step_1.5T-warmup_true-flan_false.yaml | 329 -- .../v0-step_1.5T-warmup_true-steps_50B.yaml | 371 -- .../annealing/v0-step_1.5T-warmup_true.yaml | 381 -- configs/annealing/v0-step_1T-warmup_true.yaml | 371 -- ...ne-step_2T-resume_optimizer-steps_50B.yaml | 1284 ----- ...a-step_2T-resume_optimizer-steps_100B.yaml | 380 -- ...a-step_2T-resume_optimizer-steps_200B.yaml | 482 -- ...ma-step_2T-resume_optimizer-steps_50B.yaml | 380 -- ...-step_2.1T-resume_optimizer-steps_50B.yaml | 375 -- .../v1.7-step_2T-cos_schedule-steps_50B.yaml | 374 -- ...7-step_2T-resume_optimizer-steps_100B.yaml | 375 -- ...7-step_2T-resume_optimizer-steps_200B.yaml | 462 -- ...e_optimizer-steps_50B-flan_downweight.yaml | 342 -- ....7-step_2T-resume_optimizer-steps_50B.yaml | 375 -- ...resume_optimizer-steps_50B_seed_76395.yaml | 377 -- ...p_2T-warmup_true-steps_50B-flan_false.yaml | 323 -- .../v1.7-step_2T-warmup_true-steps_50B.yaml | 378 -- configs/c4-extra-tiny-debug.yaml | 112 - configs/c4-large.yaml | 186 - configs/c4-medium.yaml | 182 - configs/c4-small.yaml | 183 - configs/c4-tiny.yaml | 188 - configs/llama7-s3.yaml | 623 --- configs/llama7.yaml | 176 - configs/llamaish1-s3.yaml | 1297 ----- configs/llamaish7-s3.yaml | 1296 ----- configs/mcli/.gitignore | 1 - configs/mcli/ananya-1b-ib.yaml | 4438 ---------------- configs/mcli/ananya-1b.yaml | 4403 ---------------- configs/mcli/harvest_hostnames.yaml | 8 - configs/mcli/mitchish-final.yaml | 110 - configs/mcli/mitchish-instruct.yml | 104 - configs/mcli/mitchish.yaml | 52 - configs/mcli/mitchish1.yaml | 61 - configs/mcli/mitchish7.yaml | 67 - configs/mcli/mitchish70-from160510.yaml | 227 - configs/mcli/mitchish70.yaml | 96 - configs/mcli/mosaic-ananya-1b.yaml | 21 - configs/mcli/olmo7-ablation-baseline.yaml | 47 - configs/mcli/olmo7-ablation-dedupedocs.yaml | 46 - configs/mcli/olmo7-ablation-dolma17.yaml | 47 - configs/mcli/v1-mix-medium-mitch-ish.yaml | 32 - configs/mcli/v1-mix-medium.yaml | 33 - configs/mcli/v1_5-mix-medium-mitch-ish.yaml | 47 - configs/mcli/v1_5-mix-medium.yaml | 33 - configs/mitchish-instruct.yaml | 148 - configs/mitchish1-s3.yaml | 1277 ----- configs/mitchish35.yaml | 183 - configs/mitchish50.yaml | 183 - configs/mitchish65-s3.yaml | 634 --- configs/mitchish65.yaml | 184 - configs/mitchish7-llamainit-s3.yaml | 1280 ----- configs/mitchish7-s3.yaml | 1282 ----- configs/mitchish70-s3.yaml | 1273 ----- configs/mitchish70.yaml | 201 - configs/olmo-small-ablation.yaml | 249 - configs/olmo7-ablation-baseline.yaml | 640 --- configs/olmo7-ablation-dedupedocs.yaml | 1618 ------ configs/olmo7-ablation-dedupeparas.yaml | 1625 ------ configs/olmo7-ablation-dolma17.yaml | 1491 ------ configs/olmo7-ablation-final2.yaml | 1258 ----- configs/olmo7-ablation-refheavy.yaml | 1704 ------- configs/pile-llamaish7-s3.yaml | 528 -- configs/pile-llamaish7.yaml | 227 - configs/tiny-llamaish-s3.yaml | 1284 ----- configs/v1-mix-medium-mitch-ish-s3.yaml | 4420 ---------------- configs/v1-mix-medium-mitch-ish.yaml | 170 - configs/v1-mix-medium-s3.yaml | 4443 ----------------- configs/v1-mix-medium.yaml | 165 - configs/v1-mix-small-s3.yaml | 4435 ---------------- configs/v1-mix-small.yaml | 187 - configs/v1_5-mix-medium-mitch-ish-s3.yaml | 625 --- configs/v1_5-mix-medium-mitch-ish.yaml | 179 - configs/v1_5-mix-medium-s3.yaml | 645 --- 90 files changed, 63793 deletions(-) delete mode 100644 configs/amberish1-weka.yaml delete mode 100644 configs/amberish13-weka.yaml delete mode 100644 configs/amberish7-weka.yaml delete mode 100644 configs/amberish70-weka.yaml delete mode 100644 configs/annealing/OLMo-7B.yaml delete mode 100644 configs/annealing/amberish7-anneal-from477850-50B.yaml delete mode 100644 configs/annealing/mitchish7-anneal-from477000-50B-flan_fix.yaml delete mode 100644 configs/annealing/mitchish7-anneal-from616350-50B.yaml delete mode 100644 configs/annealing/mitchish7-datafix-anneal-from639650-50B-warmup-lr.yaml delete mode 100644 configs/annealing/mitchish7-datafix-anneal-from639650-50B.yaml delete mode 100644 configs/annealing/olmo70b-from160510-100B.yaml delete mode 100644 configs/annealing/olmo70b-from205000-100B.yaml delete mode 100644 configs/annealing/olmo70b-from205000-150B.yaml delete mode 100644 configs/annealing/olmo70b-from205000-300B.yaml delete mode 100644 configs/annealing/olmo70b-resume_optimizer-steps_50B.yaml delete mode 100644 configs/annealing/v0-step_1.5T-superweb25-warmup_true.yaml delete mode 100644 configs/annealing/v0-step_1.5T-warmup_true-flan_false.yaml delete mode 100644 configs/annealing/v0-step_1.5T-warmup_true-steps_50B.yaml delete mode 100644 configs/annealing/v0-step_1.5T-warmup_true.yaml delete mode 100644 configs/annealing/v0-step_1T-warmup_true.yaml delete mode 100644 configs/annealing/v1.7-baseline-step_2T-resume_optimizer-steps_50B.yaml delete mode 100644 configs/annealing/v1.7-fix_redpajama-step_2T-resume_optimizer-steps_100B.yaml delete mode 100644 configs/annealing/v1.7-fix_redpajama-step_2T-resume_optimizer-steps_200B.yaml delete mode 100644 configs/annealing/v1.7-fix_redpajama-step_2T-resume_optimizer-steps_50B.yaml delete mode 100644 configs/annealing/v1.7-step_2.1T-resume_optimizer-steps_50B.yaml delete mode 100644 configs/annealing/v1.7-step_2T-cos_schedule-steps_50B.yaml delete mode 100644 configs/annealing/v1.7-step_2T-resume_optimizer-steps_100B.yaml delete mode 100644 configs/annealing/v1.7-step_2T-resume_optimizer-steps_200B.yaml delete mode 100644 configs/annealing/v1.7-step_2T-resume_optimizer-steps_50B-flan_downweight.yaml delete mode 100644 configs/annealing/v1.7-step_2T-resume_optimizer-steps_50B.yaml delete mode 100644 configs/annealing/v1.7-step_2T-resume_optimizer-steps_50B_seed_76395.yaml delete mode 100644 configs/annealing/v1.7-step_2T-warmup_true-steps_50B-flan_false.yaml delete mode 100644 configs/annealing/v1.7-step_2T-warmup_true-steps_50B.yaml delete mode 100644 configs/c4-extra-tiny-debug.yaml delete mode 100644 configs/c4-large.yaml delete mode 100644 configs/c4-medium.yaml delete mode 100644 configs/c4-small.yaml delete mode 100644 configs/c4-tiny.yaml delete mode 100644 configs/llama7-s3.yaml delete mode 100644 configs/llama7.yaml delete mode 100644 configs/llamaish1-s3.yaml delete mode 100644 configs/llamaish7-s3.yaml delete mode 100644 configs/mcli/.gitignore delete mode 100644 configs/mcli/ananya-1b-ib.yaml delete mode 100644 configs/mcli/ananya-1b.yaml delete mode 100644 configs/mcli/harvest_hostnames.yaml delete mode 100644 configs/mcli/mitchish-final.yaml delete mode 100644 configs/mcli/mitchish-instruct.yml delete mode 100644 configs/mcli/mitchish.yaml delete mode 100644 configs/mcli/mitchish1.yaml delete mode 100644 configs/mcli/mitchish7.yaml delete mode 100644 configs/mcli/mitchish70-from160510.yaml delete mode 100644 configs/mcli/mitchish70.yaml delete mode 100644 configs/mcli/mosaic-ananya-1b.yaml delete mode 100644 configs/mcli/olmo7-ablation-baseline.yaml delete mode 100644 configs/mcli/olmo7-ablation-dedupedocs.yaml delete mode 100644 configs/mcli/olmo7-ablation-dolma17.yaml delete mode 100644 configs/mcli/v1-mix-medium-mitch-ish.yaml delete mode 100644 configs/mcli/v1-mix-medium.yaml delete mode 100644 configs/mcli/v1_5-mix-medium-mitch-ish.yaml delete mode 100644 configs/mcli/v1_5-mix-medium.yaml delete mode 100644 configs/mitchish-instruct.yaml delete mode 100644 configs/mitchish1-s3.yaml delete mode 100644 configs/mitchish35.yaml delete mode 100644 configs/mitchish50.yaml delete mode 100644 configs/mitchish65-s3.yaml delete mode 100644 configs/mitchish65.yaml delete mode 100644 configs/mitchish7-llamainit-s3.yaml delete mode 100644 configs/mitchish7-s3.yaml delete mode 100644 configs/mitchish70-s3.yaml delete mode 100644 configs/mitchish70.yaml delete mode 100644 configs/olmo-small-ablation.yaml delete mode 100644 configs/olmo7-ablation-baseline.yaml delete mode 100644 configs/olmo7-ablation-dedupedocs.yaml delete mode 100644 configs/olmo7-ablation-dedupeparas.yaml delete mode 100644 configs/olmo7-ablation-dolma17.yaml delete mode 100644 configs/olmo7-ablation-final2.yaml delete mode 100644 configs/olmo7-ablation-refheavy.yaml delete mode 100644 configs/pile-llamaish7-s3.yaml delete mode 100644 configs/pile-llamaish7.yaml delete mode 100644 configs/tiny-llamaish-s3.yaml delete mode 100644 configs/v1-mix-medium-mitch-ish-s3.yaml delete mode 100644 configs/v1-mix-medium-mitch-ish.yaml delete mode 100644 configs/v1-mix-medium-s3.yaml delete mode 100644 configs/v1-mix-medium.yaml delete mode 100644 configs/v1-mix-small-s3.yaml delete mode 100644 configs/v1-mix-small.yaml delete mode 100644 configs/v1_5-mix-medium-mitch-ish-s3.yaml delete mode 100644 configs/v1_5-mix-medium-mitch-ish.yaml delete mode 100644 configs/v1_5-mix-medium-s3.yaml diff --git a/configs/amberish1-weka.yaml b/configs/amberish1-weka.yaml deleted file mode 100644 index 63964e917..000000000 --- a/configs/amberish1-weka.yaml +++ /dev/null @@ -1,1297 +0,0 @@ -run_name: amberish1-run001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-small - group: ${run_name} - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - include_bias: false - block_type: sequential - layer_norm_type: rms - layer_norm_with_affine: true - layer_norm_eps: 1e-6 - bias_for_layer_norm: false - attention_layer_norm_with_affine: true - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 50279 - pad_token_id: 1 - init_device: meta - init_fn: normal - init_std: 0.02 - init_cutoff_factor: 3 - -compile: null - -optimizer: - name: adamw - learning_rate: 4.0e-4 - weight_decay: 0.1 - eps: 1e-8 - decay_norm_and_bias: true - decay_embeddings: false - betas: - - 0.9 - - 0.95 - metrics_log_interval: 1 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 8388608000 - t_max: 3e12 - alpha_f: 0.1 - warmup_min_lr: 0.0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/${run_name} -# remote_save_folder: weka://oe-training-default/ai2-llm/checkpoints/OLMo-small/${run_name} -save_overwrite: false - -save_interval: 1000 -save_interval_ephemeral: 500 -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2ep -global_train_batch_size: 1024 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: null - sharding_strategy: SHARD_GRAD_OP - precision: mixed - -# activation_checkpointing: whole_layer - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -gen1_gc_interval: 10 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - generate_doc_lengths: true - datasets: - c4_en-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - - label: mmlu_stem_mc_5shot_test - type: downstream - - - label: mmlu_humanities_mc_5shot_test - type: downstream - - - label: mmlu_social_sciences_mc_5shot_test - type: downstream - - - label: mmlu_other_mc_5shot_test - type: downstream - - - label: basic_arithmetic - type: downstream - - - label: trivia_qa_wiki_ppl - type: downstream - - - label: natural_qs_open_ppl - type: downstream - - - label: arc_easy_ppl - type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - instance_filter: - repetition_max_period: 13 - repetition_min_period: 1 - repetition_max_count: 32 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # MEGAWIKA v1 (4.6 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - # ~> REDPAJAMA STACK-EXCHANGE (19.63 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (27.97 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - # ~> CC NEWS (14.3 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00002.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (138.4 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - # ~> REDDIT (79.9 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - # ~> FALCON (547.341 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 50% (178.4 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - # ~> DOLMA CC MIDDLE 33% (242.05 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - # ~> DOLMA CC TAIL 33% (191.4 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy diff --git a/configs/amberish13-weka.yaml b/configs/amberish13-weka.yaml deleted file mode 100644 index 9e1f8e442..000000000 --- a/configs/amberish13-weka.yaml +++ /dev/null @@ -1,1293 +0,0 @@ -run_name: amberish13-run001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: ${run_name} - -model: - d_model: 5120 - n_heads: 40 - n_layers: 40 - mlp_hidden_size: 27648 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - include_bias: false - block_type: sequential - layer_norm_type: rms - layer_norm_with_affine: true - layer_norm_eps: 1e-6 - bias_for_layer_norm: false - attention_layer_norm_with_affine: true - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 50279 - pad_token_id: 1 - init_device: meta - init_fn: normal - init_std: 0.02 - init_cutoff_factor: 3 - -compile: null - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - eps: 1e-8 - decay_norm_and_bias: true - decay_embeddings: false - betas: - - 0.9 - - 0.95 - metrics_log_interval: 1 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 8388608000 - t_max: 3e12 - alpha_f: 0.1 - warmup_min_lr: 0.0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} -# remote_save_folder: weka://oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} -save_overwrite: false - -save_interval: 1000 -save_interval_ephemeral: 250 -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2ep -global_train_batch_size: 1024 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -# activation_checkpointing: whole_layer - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - - label: mmlu_stem_mc_5shot_test - type: downstream - - - label: mmlu_humanities_mc_5shot_test - type: downstream - - - label: mmlu_social_sciences_mc_5shot_test - type: downstream - - - label: mmlu_other_mc_5shot_test - type: downstream - - - label: basic_arithmetic - type: downstream - - - label: trivia_qa_wiki_ppl - type: downstream - - - label: natural_qs_open_ppl - type: downstream - - - label: arc_easy_ppl - type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - instance_filter: - repetition_max_period: 13 - repetition_min_period: 1 - repetition_max_count: 32 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # MEGAWIKA v1 (4.6 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - # ~> REDPAJAMA STACK-EXCHANGE (19.63 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (27.97 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - # ~> CC NEWS (14.3 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00002.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (138.4 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - # ~> REDDIT (79.9 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - # ~> FALCON (547.341 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 50% (178.4 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - # ~> DOLMA CC MIDDLE 33% (242.05 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - # ~> DOLMA CC TAIL 33% (191.4 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy diff --git a/configs/amberish7-weka.yaml b/configs/amberish7-weka.yaml deleted file mode 100644 index ab6ce3e12..000000000 --- a/configs/amberish7-weka.yaml +++ /dev/null @@ -1,1293 +0,0 @@ -run_name: amberish7-run001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: ${run_name} - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - include_bias: false - block_type: sequential - layer_norm_type: rms - layer_norm_with_affine: true - layer_norm_eps: 1e-6 - bias_for_layer_norm: false - attention_layer_norm_with_affine: true - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 50279 - pad_token_id: 1 - init_device: meta - init_fn: normal - init_std: 0.02 - init_cutoff_factor: 3 - -compile: null - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - eps: 1e-8 - decay_norm_and_bias: true - decay_embeddings: false - betas: - - 0.9 - - 0.95 - metrics_log_interval: 1 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 8388608000 - t_max: 3e12 - alpha_f: 0.1 - warmup_min_lr: 0.0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} -# remote_save_folder: weka://oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} -save_overwrite: false - -save_interval: 1000 -save_interval_ephemeral: 250 -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2ep -global_train_batch_size: 1024 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -# activation_checkpointing: whole_layer - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - - label: mmlu_stem_mc_5shot_test - type: downstream - - - label: mmlu_humanities_mc_5shot_test - type: downstream - - - label: mmlu_social_sciences_mc_5shot_test - type: downstream - - - label: mmlu_other_mc_5shot_test - type: downstream - - - label: basic_arithmetic - type: downstream - - - label: trivia_qa_wiki_ppl - type: downstream - - - label: natural_qs_open_ppl - type: downstream - - - label: arc_easy_ppl - type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - instance_filter: - repetition_max_period: 13 - repetition_min_period: 1 - repetition_max_count: 32 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # MEGAWIKA v1 (4.6 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - # ~> REDPAJAMA STACK-EXCHANGE (19.63 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (27.97 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - # ~> CC NEWS (14.3 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00002.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (138.4 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - # ~> REDDIT (79.9 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - # ~> FALCON (547.341 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 50% (178.4 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - # ~> DOLMA CC MIDDLE 33% (242.05 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - # ~> DOLMA CC TAIL 33% (191.4 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy diff --git a/configs/amberish70-weka.yaml b/configs/amberish70-weka.yaml deleted file mode 100644 index 09225d5a8..000000000 --- a/configs/amberish70-weka.yaml +++ /dev/null @@ -1,1294 +0,0 @@ -run_name: amberish70-run001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-large - group: ${run_name} - -model: - d_model: 8192 - n_heads: 64 - n_kv_heads: 8 - n_layers: 80 - mlp_hidden_size: 57344 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - include_bias: false - block_type: sequential - layer_norm_type: rms - layer_norm_with_affine: true - layer_norm_eps: 1e-6 - bias_for_layer_norm: false - attention_layer_norm_with_affine: true - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 50279 - pad_token_id: 1 - init_device: meta - init_fn: normal - init_std: 0.02 - init_cutoff_factor: 3 - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 - weight_decay: 0.1 - eps: 1e-8 - decay_norm_and_bias: true - decay_embeddings: false - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 8388608000 - t_max: 3e12 - alpha_f: 0.1 - warmup_min_lr: 0.0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-large/${run_name} -# remote_save_folder: weka://oe-training-default/ai2-llm/checkpoints/OLMo-large/${run_name} -save_overwrite: false - -save_interval: 1000 -save_interval_ephemeral: 250 -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2ep -global_train_batch_size: 1024 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -activation_checkpointing: whole_layer - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - - label: mmlu_stem_mc_5shot_test - type: downstream - - - label: mmlu_humanities_mc_5shot_test - type: downstream - - - label: mmlu_social_sciences_mc_5shot_test - type: downstream - - - label: mmlu_other_mc_5shot_test - type: downstream - - - label: basic_arithmetic - type: downstream - - - label: trivia_qa_wiki_ppl - type: downstream - - - label: natural_qs_open_ppl - type: downstream - - - label: arc_easy_ppl - type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - instance_filter: - repetition_max_period: 13 - repetition_min_period: 1 - repetition_max_count: 32 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # MEGAWIKA v1 (4.6 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - # ~> REDPAJAMA STACK-EXCHANGE (19.63 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (27.97 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - # ~> CC NEWS (14.3 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00002.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00003.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00002.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (138.4 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - # ~> REDDIT (79.9 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - # ~> FALCON (547.341 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 50% (178.4 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00001.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - # ~> DOLMA CC MIDDLE 33% (242.05 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - # ~> DOLMA CC TAIL 33% (191.4 GT) - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy diff --git a/configs/annealing/OLMo-7B.yaml b/configs/annealing/OLMo-7B.yaml deleted file mode 100644 index 1e31be11a..000000000 --- a/configs/annealing/OLMo-7B.yaml +++ /dev/null @@ -1,206 +0,0 @@ -run_name: OLMo-7B -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: OLMo-7B-annealing # TODO: change to what you like - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 50279 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: - fullgraph: false - -optimizer: - name: adamw - learning_rate: 3.0e-4 # TODO: change to your peak learning - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: # TODO: change to what you want - name: linear_with_warmup - t_warmup: 100 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/oe-data-annealing/${run_name} -save_overwrite: true -# Sharded checkpoints (best for restarts) -save_interval: 500 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -restore_dataloader: false # TODO: this should only be 'false' initially - -load_path: /net/nfs/allennlp/llm-checkpoints/step551000-unsharded #TODO: change this - -max_duration: null -global_train_batch_size: 2048 # TODO: adjust as needed -device_train_microbatch_size: 2 # TODO: adjust as needed -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: v3-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - v3-small-c4_en-validation: - - r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - v3-small-dolma_books-validation: - - r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - v3-small-dolma_common-crawl-validation: - - r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - v3-small-dolma_pes2o-validation: - - r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - v3-small-dolma_reddit-validation: - - r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - v3-small-dolma_stack-validation: - - r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - v3-small-dolma_wiki-validation: - - r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - v3-small-ice-validation: - - r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - v3-small-m2d2_s2orc-validation: - - r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - v3-small-pile-validation: - - r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - v3-small-wikitext_103-validation: - - r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - - label: v2-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - v2-small-4chan-validation: - - r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - v2-small-c4_100_domains-validation: - - r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - v2-small-c4_en-validation: - - r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - v2-small-gab-validation: - - r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - v2-small-ice-validation: - - r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - v2-small-m2d2_s2orc-validation: - - r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - v2-small-m2d2_wiki-validation: - - r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - v2-small-manosphere-validation: - - r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - v2-small-mc4_en-validation: - - r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - v2-small-pile-validation: - - r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - v2-small-ptb-validation: - - r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - v2-small-twitterAEE-validation: - - r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - v2-small-wikitext_103-validation: - - r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream - -data: - pad_direction: right - num_workers: 8 - drop_last: true - pin_memory: true - prefetch_factor: 2 - persistent_workers: true - timeout: 0 - paths: - - s3://ai2-llm/data/... # TODO: update these paths diff --git a/configs/annealing/amberish7-anneal-from477850-50B.yaml b/configs/annealing/amberish7-anneal-from477850-50B.yaml deleted file mode 100644 index d681939c0..000000000 --- a/configs/annealing/amberish7-anneal-from477850-50B.yaml +++ /dev/null @@ -1,381 +0,0 @@ -run_name: amberish7-anneal-from477850-50B -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: amberish7-anneal-from477850-50B - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - include_bias: false - block_type: sequential - layer_norm_type: rms - layer_norm_with_affine: true - layer_norm_eps: 1e-6 - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: normal - init_std: 0.02 - init_cutoff_factor: 3 - -compile: null - -optimizer: - name: adamw - learning_rate: 0.00009732 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: false - betas: - - 0.9 - - 0.95 - eps: 1e-8 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: runs/ -save_overwrite: true -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - - -# final checkpoint for new 7b model. -load_path: s3://ai2-llm/checkpoints/OLMo-medium/amberish7/step477850-unsharded - -restore_dataloader: false -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: true - -max_duration: 50e9T -stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 -global_train_batch_size: 1024 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -distributed_strategy: fsdp -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (14.20 GT). Double stackexchange and remove arxiv. - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.6 GT) - #################################### - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/mitchish7-anneal-from477000-50B-flan_fix.yaml b/configs/annealing/mitchish7-anneal-from477000-50B-flan_fix.yaml deleted file mode 100644 index 7d8bb5e0d..000000000 --- a/configs/annealing/mitchish7-anneal-from477000-50B-flan_fix.yaml +++ /dev/null @@ -1,326 +0,0 @@ -run_name: mitchish7-anneal-from477000-50B-flan_fix -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: mitchish7-anneal-from477000-50B-flan_fix - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 0.00009785 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: runs/ -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: -1 -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - - -load_path: s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step477000-unsharded - -restore_dataloader: false -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: true - - -max_duration: 50e9T -stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 -global_train_batch_size: 1024 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -distributed_strategy: fsdp -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (14.20 GT). Double stackexchange and remove arxiv. - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.6 G ) - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_mix/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_mix/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_mix/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_mix/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_mix/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_mix/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_mix/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_mix/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_mix/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_mix/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_mix/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_mix/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/mitchish7-anneal-from616350-50B.yaml b/configs/annealing/mitchish7-anneal-from616350-50B.yaml deleted file mode 100644 index b43bcf346..000000000 --- a/configs/annealing/mitchish7-anneal-from616350-50B.yaml +++ /dev/null @@ -1,388 +0,0 @@ -run_name: mitchish7-anneal-from616350-50B -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: mitchish7-anneal-from616350-50B - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 0.00004262 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: runs/ -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: -1 -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - - -# # 2.6T token final checkpoint for new 7B model. -# load_path: s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step616350 -# NOTE(davidw) restart from failed checkpoint -load_path: s3://ai2-llm/checkpoints/davidw/annealing/mitchish7-anneal-from616350-50B/step2000 - -# restore_dataloader: false -# no_pre_train_checkpoint: true -# reset_optimizer_state: false -# reset_trainer_state: true - -restore_dataloader: true -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: false - - -max_duration: 50e9T -stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 -global_train_batch_size: 1024 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (14.20 GT). Double stackexchange and remove arxiv. - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/mitchish7-datafix-anneal-from639650-50B-warmup-lr.yaml b/configs/annealing/mitchish7-datafix-anneal-from639650-50B-warmup-lr.yaml deleted file mode 100644 index 50136a4e5..000000000 --- a/configs/annealing/mitchish7-datafix-anneal-from639650-50B-warmup-lr.yaml +++ /dev/null @@ -1,387 +0,0 @@ -run_name: mitchish7-datafix-anneal-from639650-50B-warmup-lr -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: mitchish7-datafix-anneal-from639650-50B-warmup-lr - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 0.00009785 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 1000 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: runs/ -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - - -# Final checkpoint for new 7B model. -# load_path: s3://ai2-llm/checkpoints/OLMo-medium/mitchish7-datafix/step639650 - -# NOTE(davidw) resume training. -load_path: s3://ai2-llm/checkpoints/davidw/annealing/mitchish7-datafix-anneal-from639650-50B-warmup-lr/step1000 - -# restore_dataloader: false -# no_pre_train_checkpoint: true -# reset_optimizer_state: false -# reset_trainer_state: true - -restore_dataloader: true -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: false - -max_duration: 50e9T -stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 -global_train_batch_size: 1024 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -distributed_strategy: fsdp -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (14.20 GT). Double stackexchange and remove arxiv. - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.6 GT) - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/mitchish7-datafix-anneal-from639650-50B.yaml b/configs/annealing/mitchish7-datafix-anneal-from639650-50B.yaml deleted file mode 100644 index 1647f22db..000000000 --- a/configs/annealing/mitchish7-datafix-anneal-from639650-50B.yaml +++ /dev/null @@ -1,379 +0,0 @@ -run_name: mitchish7-datafix-anneal-from639650-50B -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: mitchish7-datafix-anneal-from639650-50B - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 0.00003743 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: runs/ -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - - -# Final checkpoint for new 7B model. -load_path: s3://ai2-llm/checkpoints/OLMo-medium/mitchish7-datafix/step639650 - -restore_dataloader: false -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: true - -max_duration: 50e9T -stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 -global_train_batch_size: 1024 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -distributed_strategy: fsdp -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (14.20 GT). Double stackexchange and remove arxiv. - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.6 GT) - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_space/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/olmo70b-from160510-100B.yaml b/configs/annealing/olmo70b-from160510-100B.yaml deleted file mode 100644 index c95b071e2..000000000 --- a/configs/annealing/olmo70b-from160510-100B.yaml +++ /dev/null @@ -1,377 +0,0 @@ -run_name: olmo70b-from160510-100B -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: olmo70b-from160510-100B - -model: - d_model: 8192 - n_heads: 64 - n_kv_heads: 8 - n_layers: 80 - # mlp_ratio: 6 - mlp_hidden_size: 57344 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 50279 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 3.084e-05 # was safe in previous runs - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: false - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 250 -save_num_checkpoints_to_keep: -1 -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -restore_dataloader: false - -load_path: s3://ai2-llm/checkpoints/OLMo-large/mitchish70-planb/step160510 - -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: true - -max_duration: 100e9T -stop_at: 6822 # round(100e9 / (3584 * 4096)) + 10 -global_train_batch_size: 3584 -device_train_microbatch_size: 4 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -activation_checkpointing: whole_layer - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - # Doesn't work from cache. - # - label: basic_arithmetic - # type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - -data: - pad_direction: right - num_workers: 8 - drop_last: true - pin_memory: true - prefetch_factor: 2 - persistent_workers: true - timeout: 0 - instance_filter: - repetition_max_period: 13 - repetition_min_period: 1 - repetition_max_count: 32 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (14.20 GT). Double stackexchange and remove arxiv. - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/olmo70b-from205000-100B.yaml b/configs/annealing/olmo70b-from205000-100B.yaml deleted file mode 100644 index 2c6092f26..000000000 --- a/configs/annealing/olmo70b-from205000-100B.yaml +++ /dev/null @@ -1,377 +0,0 @@ -run_name: olmo70b-from205000-100B -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: olmo70b-from205000-100B - -model: - d_model: 8192 - n_heads: 64 - n_kv_heads: 8 - n_layers: 80 - # mlp_ratio: 6 - mlp_hidden_size: 57344 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 50279 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 3.084e-05 # lr from step 205000 minus 20% - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: false - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 250 -save_num_checkpoints_to_keep: -1 -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -restore_dataloader: false - -load_path: s3://ai2-llm/checkpoints/OLMo-large/mitchish70-pland/step205000 - -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: true - -max_duration: 100e9T -stop_at: 6822 # round(100e9 / (3584 * 4096)) + 10 -global_train_batch_size: 3584 -device_train_microbatch_size: 4 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -activation_checkpointing: whole_layer - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - # Doesn't work from cache. - # - label: basic_arithmetic - # type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - -data: - pad_direction: right - num_workers: 8 - drop_last: true - pin_memory: true - prefetch_factor: 2 - persistent_workers: true - timeout: 0 - instance_filter: - repetition_max_period: 13 - repetition_min_period: 1 - repetition_max_count: 32 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (14.20 GT). Double stackexchange and remove arxiv. - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/olmo70b-from205000-150B.yaml b/configs/annealing/olmo70b-from205000-150B.yaml deleted file mode 100644 index c3e593f45..000000000 --- a/configs/annealing/olmo70b-from205000-150B.yaml +++ /dev/null @@ -1,489 +0,0 @@ -run_name: olmo70b-from205000-150B -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-large - group: olmo70b-from205000-150BB - -model: - d_model: 8192 - n_heads: 64 - n_kv_heads: 8 - n_layers: 80 - # mlp_ratio: 6 - mlp_hidden_size: 57344 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 50279 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 3.084e-05 # lr from step 205000 minus 20% - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: false - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 250 -save_num_checkpoints_to_keep: -1 -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -restore_dataloader: false - -load_path: s3://ai2-llm/checkpoints/OLMo-large/mitchish70-pland/step205000 - -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: true - -max_duration: 100e9T -stop_at: 6822 # round(100e9 / (3584 * 4096)) + 10 -global_train_batch_size: 3584 -device_train_microbatch_size: 4 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -activation_checkpointing: whole_layer - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - # Doesn't work from cache. - # - label: basic_arithmetic - # type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - -data: - pad_direction: right - num_workers: 8 - drop_last: true - pin_memory: true - prefetch_factor: 2 - persistent_workers: true - timeout: 0 - instance_filter: - repetition_max_period: 13 - repetition_min_period: 1 - repetition_max_count: 32 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (16.29 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (19.63 GT). Double stackexchange and remove arxiv. - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (20.64 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (16.64 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - # ~> REDDIT (16.52 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> FALCON (16.43 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - # ~> CC news (7.03 GT) - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy - # ~> Megawika (4.56 GT) - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy diff --git a/configs/annealing/olmo70b-from205000-300B.yaml b/configs/annealing/olmo70b-from205000-300B.yaml deleted file mode 100644 index 4d3410a63..000000000 --- a/configs/annealing/olmo70b-from205000-300B.yaml +++ /dev/null @@ -1,526 +0,0 @@ -run_name: olmo70b-from205000-300B -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: olmo70b-from205000-300B - -model: - d_model: 8192 - n_heads: 64 - n_kv_heads: 8 - n_layers: 80 - # mlp_ratio: 6 - mlp_hidden_size: 57344 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 50279 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 3.084e-05 # was safe in previous runs - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: false - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 250 -save_num_checkpoints_to_keep: -1 -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -restore_dataloader: false - -load_path: s3://ai2-llm/checkpoints/OLMo-large/mitchish70-pland/step205000 - -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: true - -max_duration: 300e9T -stop_at: 20446 # round(300e9 / (3584 * 4096)) + 10 -global_train_batch_size: 3584 -device_train_microbatch_size: 4 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -activation_checkpointing: whole_layer - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - # Doesn't work from cache. - # - label: basic_arithmetic - # type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - -data: - pad_direction: right - num_workers: 8 - drop_last: true - pin_memory: true - prefetch_factor: 2 - persistent_workers: true - timeout: 0 - instance_filter: - repetition_max_period: 13 - repetition_min_period: 1 - repetition_max_count: 32 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (19.63 GT). - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (10.41 GT). - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.62 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (43.34 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (42.64 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - # ~> REDDIT (42.22 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - # ~> FALCON (42.88 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy diff --git a/configs/annealing/olmo70b-resume_optimizer-steps_50B.yaml b/configs/annealing/olmo70b-resume_optimizer-steps_50B.yaml deleted file mode 100644 index ea1d52984..000000000 --- a/configs/annealing/olmo70b-resume_optimizer-steps_50B.yaml +++ /dev/null @@ -1,374 +0,0 @@ -run_name: olmo70b-resume_optimizer-steps_50B_from181500 -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: olmo70b-resume_optimizer-steps_50B_from181500 - -model: - d_model: 8192 - n_heads: 64 - n_kv_heads: 8 - n_layers: 80 - # mlp_ratio: 6 - mlp_hidden_size: 57344 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 50279 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 0.00009446 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: false - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 500 -save_num_checkpoints_to_keep: -1 -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -restore_dataloader: false - -# TODO(dirkg) confirm correct -load_path: s3://ai2-llm/checkpoints/OLMo-large/mitchish70-planb/step181500 - -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: true - -max_duration: 50e9T -stop_at: 4551 # round(50e9 / (2688 * 4096)) + 10 -global_train_batch_size: 2688 -device_train_microbatch_size: 3 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -activation_checkpointing: whole_layer - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - # Doesn't work from cache. - # - label: basic_arithmetic - # type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - -data: - pad_direction: right - num_workers: 8 - drop_last: true - pin_memory: true - prefetch_factor: 2 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (14.20 GT). Double stackexchange and remove arxiv. - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/v0-step_1.5T-superweb25-warmup_true.yaml b/configs/annealing/v0-step_1.5T-superweb25-warmup_true.yaml deleted file mode 100644 index bf52bbba1..000000000 --- a/configs/annealing/v0-step_1.5T-superweb25-warmup_true.yaml +++ /dev/null @@ -1,393 +0,0 @@ -run_name: v0-step_1.5T-superweb25-warmup_true -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: v0-step_1.5T-superweb25-warmup_true - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 # This is half the max LR from official run. - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 1000 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 200 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -restore_dataloader: false - -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/hrshlkzq/step119000-unsharded/ # 0.5T -# load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/j18wauyq/step238000-unsharded/ # 1.0T -# load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/4xel5n7e/step358000-unsharded/ # 1.5T -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/xtruaap8/step477000-unsharded/ # 2.0T - -# R2 has weird permissions issues; use S3 instead. -load_path: s3://ai2-llm/checkpoints/davidw/olmo-medium/4xel5n7e/step358000-unsharded/ # 1.5T - -no_pre_train_checkpoint: true -reset_optimizer_state: true -reset_trainer_state: true - -max_duration: 100e9T -global_train_batch_size: 3072 -device_train_microbatch_size: 3 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - sharding_strategy: SHARD_GRAD_OP - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (7.2 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - # ~> REDPAJAMA ARXIV (6.7 GT) - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (3.5 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/c4/allenai_gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/c4/allenai_gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/c4/allenai_gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/c4/allenai_gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/c4/allenai_gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/c4/allenai_gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/c4/allenai_gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/c4/allenai_gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/c4/allenai_gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/c4/allenai_gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/c4/allenai_gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/c4/allenai_gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/c4/allenai_gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (5.9 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/falcon/allenai_gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/falcon/allenai_gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/falcon/allenai_gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/falcon/allenai_gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/falcon/allenai_gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/falcon/allenai_gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - # ~> DOLMA CC HIGH (4.1 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/cc_en_head/allenai_gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/cc_en_head/allenai_gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/cc_en_head/allenai_gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/cc_en_head/allenai_gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/cc_en_head/allenai_gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - # ~> DOLMA CC MEDIUM (3.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/cc_en_middle/allenai_gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/cc_en_middle/allenai_gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/cc_en_middle/allenai_gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/cc_en_middle/allenai_gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/cc_en_middle/allenai_gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - # ~> DOLMA CC LOW (2.0 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/cc_en_tail/allenai_gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/cc_en_tail/allenai_gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/cc_en_tail/allenai_gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/cc_en_tail/allenai_gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-7/documents/superhigh-25/cc_en_tail/allenai_gpt-neox-olmo-dolma-v1_5/part-004-00000.npy diff --git a/configs/annealing/v0-step_1.5T-warmup_true-flan_false.yaml b/configs/annealing/v0-step_1.5T-warmup_true-flan_false.yaml deleted file mode 100644 index 7032c80d5..000000000 --- a/configs/annealing/v0-step_1.5T-warmup_true-flan_false.yaml +++ /dev/null @@ -1,329 +0,0 @@ -run_name: v0-step_1.5T-warmup_true-flan_false -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: v0-step_1.5T-warmup_true-flan_false - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 # This is half the max LR from official run. - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 1000 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 500 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -restore_dataloader: false - -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/hrshlkzq/step119000-unsharded/ # 0.5T -# load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/j18wauyq/step238000-unsharded/ # 1.0T -# load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/4xel5n7e/step358000-unsharded/ # 1.5T -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/xtruaap8/step477000-unsharded/ # 2.0T - -# R2 has weird permissions issues; use S3 instead. -load_path: s3://ai2-llm/checkpoints/davidw/olmo-medium/4xel5n7e/step358000-unsharded/ # 1.5T - - -no_pre_train_checkpoint: true -reset_optimizer_state: true # These both are false when resetting.. -reset_trainer_state: true - -max_duration: 100e9T -global_train_batch_size: 3072 -device_train_microbatch_size: 3 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - sharding_strategy: SHARD_GRAD_OP - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (9.5 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (9.4 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> REDPAJAMA ARXIV (11.3 GT) - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00002.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (9.7 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.85 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - # ~> REDDIT (10.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - # ~> FALCON (11.9 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy diff --git a/configs/annealing/v0-step_1.5T-warmup_true-steps_50B.yaml b/configs/annealing/v0-step_1.5T-warmup_true-steps_50B.yaml deleted file mode 100644 index 6a483870b..000000000 --- a/configs/annealing/v0-step_1.5T-warmup_true-steps_50B.yaml +++ /dev/null @@ -1,371 +0,0 @@ -run_name: v0-step_1.5T-warmup_true-steps_50B -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: v0-step_1.5T-warmup_true-steps_50B - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 # This is half the max LR from official run. - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 1000 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 200 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -restore_dataloader: false - -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/hrshlkzq/step119000-unsharded/ # 0.5T -# load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/j18wauyq/step238000-unsharded/ # 1.0T -# load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/4xel5n7e/step358000-unsharded/ # 1.5T -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/xtruaap8/step477000-unsharded/ # 2.0T - -# R2 has weird permissions issues; use S3 instead. -load_path: s3://ai2-llm/checkpoints/davidw/olmo-medium/4xel5n7e/step358000-unsharded/ # 1.5T - -no_pre_train_checkpoint: true -reset_optimizer_state: true -reset_trainer_state: true - -max_duration: 50e9T -global_train_batch_size: 3072 -device_train_microbatch_size: 3 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - sharding_strategy: SHARD_GRAD_OP - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (7.2 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - # ~> REDPAJAMA ARXIV (6.7 GT) - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/v0-step_1.5T-warmup_true.yaml b/configs/annealing/v0-step_1.5T-warmup_true.yaml deleted file mode 100644 index 919eb5e38..000000000 --- a/configs/annealing/v0-step_1.5T-warmup_true.yaml +++ /dev/null @@ -1,381 +0,0 @@ -run_name: v0-step_1.5T-warmup_true -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: v0-step_1.5T-warmup_true - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 # This is half the max LR from official run. - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 1000 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: /runs # This was a mistake; should be /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 200 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -# restore_dataloader: false -# NOTE(davidw) Restore dataloader since training broke in the middle. -restore_dataloader: true - -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/hrshlkzq/step119000-unsharded/ # 0.5T -# load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/j18wauyq/step238000-unsharded/ # 1.0T -# load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/4xel5n7e/step358000-unsharded/ # 1.5T -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/xtruaap8/step477000-unsharded/ # 2.0T - -# R2 has weird permissions issues; use S3 instead. -# Just point to my most recent checkpoint. -# load_path: s3://ai2-llm/checkpoints/davidw/olmo-medium/4xel5n7e/step358000-unsharded/ # 1.5T - -# NOTE(davidw) Job failed; restart from last available checkpoint -load_path: s3://ai2-llm/checkpoints/davidw/annealing/v0-step_1.5T-warmup_true/step1800 - -no_pre_train_checkpoint: true -# reset_optimizer_state: true # These both are false when resetting.. -# reset_trainer_state: true - -# NOTE(davidw) Job failed; needed to restart from checkpoint. -reset_optimizer_state: false -reset_trainer_state: false - -max_duration: 100e9T -global_train_batch_size: 3072 -device_train_microbatch_size: 3 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - sharding_strategy: SHARD_GRAD_OP - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (7.2 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - # ~> REDPAJAMA ARXIV (6.7 GT) - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/v0-step_1T-warmup_true.yaml b/configs/annealing/v0-step_1T-warmup_true.yaml deleted file mode 100644 index 9b27b9880..000000000 --- a/configs/annealing/v0-step_1T-warmup_true.yaml +++ /dev/null @@ -1,371 +0,0 @@ -run_name: v0-step_1T-warmup_true -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: v0-step_1T-warmup_true - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 # This is half the max LR from official run. - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 1000 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 200 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -restore_dataloader: false - -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/hrshlkzq/step119000-unsharded/ # 0.5T -# load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/j18wauyq/step238000-unsharded/ # 1.0T -# load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/4xel5n7e/step358000-unsharded/ # 1.5T -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/xtruaap8/step477000-unsharded/ # 2.0T - -# R2 has weird permissions issues; use S3 instead. -load_path: s3://ai2-llm/checkpoints/davidw/olmo-medium/j18wauyq/step238000-unsharded/ # 1.0T - -no_pre_train_checkpoint: true -reset_optimizer_state: true -reset_trainer_state: true - -max_duration: 100e9T -global_train_batch_size: 3072 -device_train_microbatch_size: 3 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - sharding_strategy: SHARD_GRAD_OP - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (7.2 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - # ~> REDPAJAMA ARXIV (6.7 GT) - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/v1.7-baseline-step_2T-resume_optimizer-steps_50B.yaml b/configs/annealing/v1.7-baseline-step_2T-resume_optimizer-steps_50B.yaml deleted file mode 100644 index f5f729195..000000000 --- a/configs/annealing/v1.7-baseline-step_2T-resume_optimizer-steps_50B.yaml +++ /dev/null @@ -1,1284 +0,0 @@ -run_name: v1.7-baseline-step_2T-resume_optimizer-steps_50B -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: v1.7-baseline-step_2T-resume_optimizer-steps_50B - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 0.00009785 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 500 -save_num_checkpoints_to_keep: -1 -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -# NOTE(davidw): resume run -restore_dataloader: true - -# 2T token checkpoint for new 7B model; we call this v1.7 to match the Dolma data. -# load_path: s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step477000-unsharded - -# NOTE(davidw) Resume run -load_path: s3://ai2-llm/checkpoints/davidw/annealing/v1.7-baseline-step_2T-resume_optimizer-steps_50B/step2900 - -no_pre_train_checkpoint: true -reset_optimizer_state: false -# NOTE(davidw) Resume run -# reset_trainer_state: true -reset_trainer_state: false - -max_duration: 50e9T -stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 -global_train_batch_size: 1024 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # MEGAWIKA v1 (4.6 GT) - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - # ~> REDPAJAMA STACK-EXCHANGE (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (27.97 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - # ~> CC NEWS (14.3 GT) - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00002.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (138.4 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - # ~> REDDIT (79.9 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - # ~> FALCON (547.341 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 50% (178.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - # ~> DOLMA CC MIDDLE 33% (242.05 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - # ~> DOLMA CC TAIL 33% (191.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy diff --git a/configs/annealing/v1.7-fix_redpajama-step_2T-resume_optimizer-steps_100B.yaml b/configs/annealing/v1.7-fix_redpajama-step_2T-resume_optimizer-steps_100B.yaml deleted file mode 100644 index 51344662d..000000000 --- a/configs/annealing/v1.7-fix_redpajama-step_2T-resume_optimizer-steps_100B.yaml +++ /dev/null @@ -1,380 +0,0 @@ -run_name: v1.7-fix_redpajama-step_2T-resume_optimizer-steps_100B -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: v1.7-fix_redpajama-step_2T-resume_optimizer-steps_100B - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 0.00009785 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 500 -save_num_checkpoints_to_keep: -1 -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -restore_dataloader: false - -# 2T token checkpoint for new 7B model; we call this v1.7 to match the Dolma data. -load_path: s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step477000-unsharded - -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: true - -max_duration: 100e9T -stop_at: 23852 # = round(100e9 / (1024 * 4096)) + 10 -global_train_batch_size: 1024 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (7.58 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - # ~> REDPAJAMA ARXIV (7.01 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/v1.7-fix_redpajama-step_2T-resume_optimizer-steps_200B.yaml b/configs/annealing/v1.7-fix_redpajama-step_2T-resume_optimizer-steps_200B.yaml deleted file mode 100644 index c39b3794b..000000000 --- a/configs/annealing/v1.7-fix_redpajama-step_2T-resume_optimizer-steps_200B.yaml +++ /dev/null @@ -1,482 +0,0 @@ -run_name: v1.7-fix_redpajama-step_2T-resume_optimizer-steps_200B -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: v1.7-fix_redpajama-step_2T-resume_optimizer-steps_200B - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 0.00009785 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 500 -save_num_checkpoints_to_keep: -1 -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -# restore_dataloader: false -# NOTE(davidw) Resumed this run in the middle; restore dataloader from checkopint. -restore_dataloader: true - -# 2T token checkpoint for new 7B model; we call this v1.7 to match the Dolma data. -# load_path: s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step477000-unsharded - -# NOTE(davidw): Load from last available checkpoint -# load_path: s3://ai2-llm/checkpoints/davidw/annealing/v1.7-fix_redpajama-step_2T-resume_optimizer-steps_200B/step10500 -# load_path: s3://ai2-llm/checkpoints/davidw/annealing/v1.7-fix_redpajama-step_2T-resume_optimizer-steps_200B/step16600 -load_path: s3://ai2-llm/checkpoints/davidw/annealing/v1.7-fix_redpajama-step_2T-resume_optimizer-steps_200B/step29000 - -no_pre_train_checkpoint: true -# reset_optimizer_state: false -# reset_trainer_state: true - -# NOTE(davidw) : Set these both to false since resuming training. -reset_optimizer_state: false -reset_trainer_state: false - -max_duration: 200e9T -stop_at: 47694 # round(200e9 / (1024 * 4096)) + 10 -global_train_batch_size: 1024 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (23.2 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (23.68 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.6 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (23.53 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (23.27 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - # ~> REDDIT (23.7 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - # ~> FALCON (23.73 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy diff --git a/configs/annealing/v1.7-fix_redpajama-step_2T-resume_optimizer-steps_50B.yaml b/configs/annealing/v1.7-fix_redpajama-step_2T-resume_optimizer-steps_50B.yaml deleted file mode 100644 index 8d657d688..000000000 --- a/configs/annealing/v1.7-fix_redpajama-step_2T-resume_optimizer-steps_50B.yaml +++ /dev/null @@ -1,380 +0,0 @@ -run_name: v1.7-fix_redpajama-step_2T-resume_optimizer-steps_50B -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: v1.7-fix_redpajama-step_2T-resume_optimizer-steps_50B - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 0.00009785 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 500 -save_num_checkpoints_to_keep: -1 -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -restore_dataloader: false - -# 2T token checkpoint for new 7B model; we call this v1.7 to match the Dolma data. -load_path: s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step477000-unsharded - -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: true - -max_duration: 50e9T -stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 -global_train_batch_size: 1024 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (7.58 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - # ~> REDPAJAMA ARXIV (7.01 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/v1.7-step_2.1T-resume_optimizer-steps_50B.yaml b/configs/annealing/v1.7-step_2.1T-resume_optimizer-steps_50B.yaml deleted file mode 100644 index e0762c556..000000000 --- a/configs/annealing/v1.7-step_2.1T-resume_optimizer-steps_50B.yaml +++ /dev/null @@ -1,375 +0,0 @@ -run_name: v1.7-step_2.1T-resume_optimizer-steps_50B -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: v1.7-step_2.1T-resume_optimizer-steps_50B - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 0.00009785 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 500 -save_num_checkpoints_to_keep: -1 - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -restore_dataloader: false - -# 2.1T token checkpoint for new 7B model; we call this v1.7 to match the Dolma data. -load_path: s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step507000-unsharded - -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: true - -max_duration: 50e9T -global_train_batch_size: 1024 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (7.2 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - # ~> REDPAJAMA ARXIV (6.7 GT) - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/v1.7-step_2T-cos_schedule-steps_50B.yaml b/configs/annealing/v1.7-step_2T-cos_schedule-steps_50B.yaml deleted file mode 100644 index 454f8523f..000000000 --- a/configs/annealing/v1.7-step_2T-cos_schedule-steps_50B.yaml +++ /dev/null @@ -1,374 +0,0 @@ -run_name: v1.7-step_2T-cos_schedule-steps_50B -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: v1.7-step_2T-cos_schedule-steps_50B - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 0.00009785 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_linear_envelope - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 500 -save_num_checkpoints_to_keep: -1 - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -restore_dataloader: false - -# 2T token checkpoint for new 7B model; we call this v1.7 to match the Dolma data. -load_path: s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step477000-unsharded - -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: true - -max_duration: 50e9T -global_train_batch_size: 1024 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (7.2 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - # ~> REDPAJAMA ARXIV (6.7 GT) - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/v1.7-step_2T-resume_optimizer-steps_100B.yaml b/configs/annealing/v1.7-step_2T-resume_optimizer-steps_100B.yaml deleted file mode 100644 index 571bdbd39..000000000 --- a/configs/annealing/v1.7-step_2T-resume_optimizer-steps_100B.yaml +++ /dev/null @@ -1,375 +0,0 @@ -run_name: v1.7-step_2T-resume_optimizer-steps_100B -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: v1.7-step_2T-resume_optimizer-steps_100B - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 0.00009785 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 500 -save_num_checkpoints_to_keep: -1 - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -restore_dataloader: false - -# 2T token checkpoint for new 7B model; we call this v1.7 to match the Dolma data. -load_path: s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step477000-unsharded - -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: true - -max_duration: 100e9T -global_train_batch_size: 1024 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (7.2 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - # ~> REDPAJAMA ARXIV (6.7 GT) - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/v1.7-step_2T-resume_optimizer-steps_200B.yaml b/configs/annealing/v1.7-step_2T-resume_optimizer-steps_200B.yaml deleted file mode 100644 index 6828f6cbf..000000000 --- a/configs/annealing/v1.7-step_2T-resume_optimizer-steps_200B.yaml +++ /dev/null @@ -1,462 +0,0 @@ -run_name: v1.7-step_2T-resume_optimizer-steps_200B -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: v1.7-step_2T-resume_optimizer-steps_200B - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 0.00009785 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 500 -save_num_checkpoints_to_keep: -1 - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -restore_dataloader: false - -# 2T token checkpoint for new 7B model; we call this v1.7 to match the Dolma data. -load_path: s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step477000-unsharded - -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: true - -max_duration: 200e9T -global_train_batch_size: 1024 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (23.2 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (23.09 GT) - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00002.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00002.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00002.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00002.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.6 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (23.53 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (23.27 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - # ~> REDDIT (23.7 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - # ~> FALCON (23.73 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy diff --git a/configs/annealing/v1.7-step_2T-resume_optimizer-steps_50B-flan_downweight.yaml b/configs/annealing/v1.7-step_2T-resume_optimizer-steps_50B-flan_downweight.yaml deleted file mode 100644 index e0b8ee6a6..000000000 --- a/configs/annealing/v1.7-step_2T-resume_optimizer-steps_50B-flan_downweight.yaml +++ /dev/null @@ -1,342 +0,0 @@ -run_name: v1.7-step_2T-resume_optimizer-steps_50B-flan_downweight -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: v1.7-step_2T-resume_optimizer-steps_50B-flan_downweight - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 0.00009785 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 500 -save_num_checkpoints_to_keep: -1 - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -restore_dataloader: false - -# 2T token checkpoint for new 7B model; we call this v1.7 to match the Dolma data. -load_path: s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step477000-unsharded - -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: false - -max_duration: 50e9T -stop_at: 11931 # round(50e9 / (1024 * 4096)) + 10 -global_train_batch_size: 1024 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (7.2 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - # ~> REDPAJAMA ARXIV (6.7 GT) - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (8.01 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/v1.7-step_2T-resume_optimizer-steps_50B.yaml b/configs/annealing/v1.7-step_2T-resume_optimizer-steps_50B.yaml deleted file mode 100644 index ae09a04f7..000000000 --- a/configs/annealing/v1.7-step_2T-resume_optimizer-steps_50B.yaml +++ /dev/null @@ -1,375 +0,0 @@ -run_name: v1.7-step_2T-resume_optimizer-steps_50B -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: v1.7-step_2T-resume_optimizer-steps_50B - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 0.00009785 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 500 -save_num_checkpoints_to_keep: -1 - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -restore_dataloader: false - -# 2T token checkpoint for new 7B model; we call this v1.7 to match the Dolma data. -load_path: s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step477000-unsharded - -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: true - -max_duration: 50e9T -global_train_batch_size: 1024 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (7.2 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - # ~> REDPAJAMA ARXIV (6.7 GT) - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/v1.7-step_2T-resume_optimizer-steps_50B_seed_76395.yaml b/configs/annealing/v1.7-step_2T-resume_optimizer-steps_50B_seed_76395.yaml deleted file mode 100644 index fb93bc773..000000000 --- a/configs/annealing/v1.7-step_2T-resume_optimizer-steps_50B_seed_76395.yaml +++ /dev/null @@ -1,377 +0,0 @@ -run_name: v1.7-step_2T-resume_optimizer-steps_50B_seed_76395 -seed: 76395 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: v1.7-step_2T-resume_optimizer-steps_50B_seed_76395 - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 0.00009785 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 0 - alpha_f: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 500 -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -restore_dataloader: false - -# 2T token checkpoint for new 7B model; we call this v1.7 to match the Dolma data. -load_path: s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step477000-unsharded - -no_pre_train_checkpoint: true -reset_optimizer_state: false -reset_trainer_state: true - -max_duration: 50e9T -global_train_batch_size: 1024 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -distributed_strategy: fsdp -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (7.2 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - # ~> REDPAJAMA ARXIV (6.7 GT) - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/annealing/v1.7-step_2T-warmup_true-steps_50B-flan_false.yaml b/configs/annealing/v1.7-step_2T-warmup_true-steps_50B-flan_false.yaml deleted file mode 100644 index 26566243d..000000000 --- a/configs/annealing/v1.7-step_2T-warmup_true-steps_50B-flan_false.yaml +++ /dev/null @@ -1,323 +0,0 @@ -run_name: v1.7-step_2T-warmup_true-steps_50B-flan_false.yaml -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: v1.7-step_2T-warmup_true-steps_50B-flan_false.yaml - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 # This is half the max LR from official run. - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 1000 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 500 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -restore_dataloader: false - -# 2T token checkpoint for new 7B model; we call this v1.7 to match the Dolma data. -load_path: s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step477000-unsharded - -no_pre_train_checkpoint: true -reset_optimizer_state: true # These both are false when resetting.. -reset_trainer_state: true - -max_duration: 50e9T -global_train_batch_size: 3072 -device_train_microbatch_size: 3 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - sharding_strategy: SHARD_GRAD_OP - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (9.5 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (9.4 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> REDPAJAMA ARXIV (11.3 GT) - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00002.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (9.7 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.85 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - # ~> REDDIT (10.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - # ~> FALCON (11.9 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy diff --git a/configs/annealing/v1.7-step_2T-warmup_true-steps_50B.yaml b/configs/annealing/v1.7-step_2T-warmup_true-steps_50B.yaml deleted file mode 100644 index b75ef7093..000000000 --- a/configs/annealing/v1.7-step_2T-warmup_true-steps_50B.yaml +++ /dev/null @@ -1,378 +0,0 @@ -run_name: v1.7-step_2T-warmup_true-steps_50B -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-annealing - group: v1.7-step_2T-warmup_true-steps_50B - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 # This is half the max LR from official run. - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 1000 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: /data -save_overwrite: false -remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name} -# Sharded checkpoints (best for restarts) -save_interval: 500 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -restore_dataloader: false - -# 2T token checkpoint for new 7B model; we call this v1.7 to match the Dolma data. -load_path: s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step477000-unsharded - -no_pre_train_checkpoint: true -reset_optimizer_state: true -reset_trainer_state: true - -max_duration: 50e9T -global_train_batch_size: 3072 -device_train_microbatch_size: 3 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - sharding_strategy: SHARD_GRAD_OP - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (6.75 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (7.2 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - # ~> REDPAJAMA ARXIV (6.7 GT) - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (7.3 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (11.5 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (9.0 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - # ~> REDDIT (9.4 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - # ~> FALCON (9.1 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy diff --git a/configs/c4-extra-tiny-debug.yaml b/configs/c4-extra-tiny-debug.yaml deleted file mode 100644 index 165e9dca1..000000000 --- a/configs/c4-extra-tiny-debug.yaml +++ /dev/null @@ -1,112 +0,0 @@ -run_name: extra-tiny-debug -seed: 6198 -dry_run: false - -model: - d_model: 256 - n_heads: 4 - n_layers: 5 - mlp_ratio: 4 - alibi: true - alibi_bias_max: 8.0 - flash_attention: false - attention_dropout: 0.0 - attention_layer_norm: false - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - include_bias: true - vocab_size: 50257 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: null - init_std: 0.02 - -optimizer: - name: lionw - learning_rate: 3.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 100 - alpha_f: 0.1 - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 2 - drop_last: true - pin_memory: true - prefetch_factor: 2 - persistent_workers: true - timeout: 0 - -eval_interval: 50 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - drop_last: true - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - drop_last: true - - label: piqa - type: downstream - - label: hellaswag - type: downstream - - label: winogrande - type: downstream - - label: openbook_qa - type: downstream - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - - label: sciq - type: downstream - - label: arc_easy - type: downstream - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - - label: copa - type: downstream - - label: rte - type: downstream - - label: commitment_bank - type: downstream - - label: mrpc - type: downstream - - label: sst2 - type: downstream - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -save_overwrite: true - -load_path: null - -max_duration: 1708984 # 7B tokens -global_train_batch_size: 16 -device_train_microbatch_size: 4 - -precision: fp32 - -wandb: - name: ${run_name} - -speed_monitor: - window_size: 20 - -console_log_interval: 10 diff --git a/configs/c4-large.yaml b/configs/c4-large.yaml deleted file mode 100644 index 585b9ee35..000000000 --- a/configs/c4-large.yaml +++ /dev/null @@ -1,186 +0,0 @@ -run_name: c4-large-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: c4-large - -model: - d_model: 8192 - n_heads: 32 - n_layers: 64 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - include_bias: false - block_type: parallel - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs and doesn't work with activation checkpointing - -activation_checkpointing: true - -optimizer: - name: lionw - learning_rate: 1.0e-5 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - alpha_f: 0.1 - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 4 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 10000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 476837 # 2T tokens -max_duration: 47684 # 200B tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 4 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 2 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 2 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/c4-medium.yaml b/configs/c4-medium.yaml deleted file mode 100644 index d458f57ac..000000000 --- a/configs/c4-medium.yaml +++ /dev/null @@ -1,182 +0,0 @@ -run_name: c4-medium-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - -model: - d_model: 4096 - n_heads: 16 - n_layers: 30 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 2 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 50000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 476837 # 2T tokens -max_duration: 47684 # 200B tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/c4-small.yaml b/configs/c4-small.yaml deleted file mode 100644 index c425e5ae5..000000000 --- a/configs/c4-small.yaml +++ /dev/null @@ -1,183 +0,0 @@ -run_name: c4-small-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: c4-small - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 2.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 2 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 9 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 10000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 476837 # 2T tokens -max_duration: 47684 # 200B tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 8 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/c4-tiny.yaml b/configs/c4-tiny.yaml deleted file mode 100644 index 2a5e6e4b8..000000000 --- a/configs/c4-tiny.yaml +++ /dev/null @@ -1,188 +0,0 @@ -run_name: c4-tiny-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - log_interval: ${console_log_interval} - -model: - d_model: 1024 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - flash_attention: false - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 1024 - include_bias: true - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs -# compile: -# mode: default - -optimizer: - name: lionw - learning_rate: 3.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 100 - alpha_f: 0.1 - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 4 - drop_last: true - pin_memory: true - prefetch_factor: 4 # bump to 16 if on LUMI - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 100000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 3814697 # 2T tokens -global_train_batch_size: 512 -device_train_microbatch_size: 16 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -console_log_interval: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 2 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 2 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/llama7-s3.yaml b/configs/llama7-s3.yaml deleted file mode 100644 index 9205ce13f..000000000 --- a/configs/llama7-s3.yaml +++ /dev/null @@ -1,623 +0,0 @@ -run_name: llama7-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: llama7 - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - mlp_hidden_size: 22016 - rope: true - flash_attention: false - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: rms - layer_norm_with_affine: true - bias_for_layer_norm: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 3072 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: full_megatron - init_std: 0.006 - init_cutoff_factor: 3 - weight_tying: false - -fsdp: - precision: mixed - wrapping_strategy: size_based - sharding_strategy: SHARD_GRAD_OP - -activation_checkpointing: whole_layer - -compile: null - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - alpha_f: 0.1 - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-000-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-001-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-002-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-003-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-004-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-005-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-006-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-006-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-007-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-008-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-008-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-009-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-010-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-010-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-011-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-012-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-013-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-013-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-014-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-014-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-015-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-016-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-017-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-018-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-019-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-020-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-021-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-022-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-023-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-024-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-025-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-025-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-026-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-027-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-027-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-028-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-028-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-029-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-030-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-031-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-032-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-033-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-033-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-034-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-034-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-035-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-036-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-037-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-038-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-039-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-040-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-041-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-042-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-042-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-043-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-043-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-044-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-044-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-045-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-046-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-046-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-046-00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-047-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-048-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-049-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-050-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-051-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-052-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-052-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-053-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-053-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-054-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-055-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-055-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-056-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-056-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-057-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-057-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-058-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-059-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-060-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-061-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-062-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-062-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-063-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-063-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-064-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-064-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-064-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-065-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-065-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-065-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-066-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-067-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-067-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-068-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-068-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-069-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-070-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-071-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-072-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-073-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-074-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-075-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-076-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-077-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-078-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-078-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-079-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-079-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-080-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-081-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-082-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-083-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-083-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-084-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-085-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-086-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-086-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-087-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-088-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-089-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-089-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-089-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-090-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-091-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-091-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-092-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-092-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-093-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-093-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-093-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-094-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-094-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-094-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-095-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-095-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-096-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-096-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-097-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-097-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-097-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-098-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-098-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-099-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-099-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-100-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-100-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-100-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-101-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-101-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-102-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-102-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-103-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-103-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-104-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-104-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-105-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-105-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-106-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-106-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-106-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-107-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-107-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-108-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-108-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-109-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-109-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-109-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-110-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-110-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-110-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-111-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-111-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-112-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-112-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-113-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-113-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-114-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-114-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-114-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-115-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-115-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-116-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-116-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-117-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-117-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-118-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-118-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-119-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-119-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-120-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-120-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-120-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-121-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-121-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-122-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-122-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-122-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-123-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-123-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-123-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-124-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-124-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-125-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-125-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-126-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-126-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-127-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-127-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-127-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-128-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-128-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-129-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-129-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-129-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-130-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-130-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-131-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-131-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-132-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-132-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-133-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-133-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-133-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-134-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-134-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-134-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-135-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-135-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-135-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-136-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-136-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-137-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-137-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-137-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-138-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-138-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-139-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-139-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-140-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-140-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-141-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-141-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-141-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-142-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-142-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-142-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-143-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-143-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-144-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-144-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-144-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-145-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-145-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-145-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-146-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-146-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-146-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-147-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-147-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-147-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-148-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-148-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-149-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-149-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-149-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-150-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-150-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-150-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-150-00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-151-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-151-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-152-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-152-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-153-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-153-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-154-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-154-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-155-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-155-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-155-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-156-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-156-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-157-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-157-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-157-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-158-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-158-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-159-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-159-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-160-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-160-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-161-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-161-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-161-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-162-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-162-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-163-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-163-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-164-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-164-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-165-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-165-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-165-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-166-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-166-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-166-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-167-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-167-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-167-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-168-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-168-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-169-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-169-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-170-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-170-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-171-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-171-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-172-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-172-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-173-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-173-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-173-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-174-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-174-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-174-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-175-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-175-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-175-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-176-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-176-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-176-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-177-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-177-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-178-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-178-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-179-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-179-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-180-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-180-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-181-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-181-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-182-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-182-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-182-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-183-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-183-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-183-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-184-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-184-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-185-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-185-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-185-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-186-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-186-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-186-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-187-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-187-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-187-00002.npy - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000000 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 500 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 423855 # 2T tokens -global_train_batch_size: 1536 -device_train_microbatch_size: 1 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implementation of the pmi_dc matrix - # type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implementation of the pmi_dc matrix - # type: downstream - - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/llama7.yaml b/configs/llama7.yaml deleted file mode 100644 index 1970831ec..000000000 --- a/configs/llama7.yaml +++ /dev/null @@ -1,176 +0,0 @@ -run_name: llama7-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: llama7 - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - mlp_hidden_size: 22016 - rope: true - flash_attention: false - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: rms - layer_norm_with_affine: true - bias_for_layer_norm: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 3072 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: full_megatron - init_std: 0.006 - init_cutoff_factor: 3 - weight_tying: false - -fsdp: - precision: mixed - wrapping_strategy: size_based - sharding_strategy: SHARD_GRAD_OP - -activation_checkpointing: whole_layer - -compile: null - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - alpha_f: 0.1 - -data: - paths: ${path.glob:${oc.env:DATA_PATH}/v1_5-sample/gpt-neox-20b-pii-special/*.npy} - pad_direction: right - num_workers: 0 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000000 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 500 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 423855 # 2T tokens -global_train_batch_size: 1536 -device_train_microbatch_size: 1 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implementation of the pmi_dc matrix - # type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implementation of the pmi_dc matrix - # type: downstream - - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/llamaish1-s3.yaml b/configs/llamaish1-s3.yaml deleted file mode 100644 index d43668587..000000000 --- a/configs/llamaish1-s3.yaml +++ /dev/null @@ -1,1297 +0,0 @@ -run_name: llamaish1-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-small - group: llamaish1 - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - # mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: full_megatron - init_std: 0.006 - init_cutoff_factor: 3 - -compile: null - -optimizer: - name: adamw - learning_rate: 4.0e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: false - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 10485760000 - t_max: 3e12 - alpha_f: 0.1 - grad_clip_warmup_steps: 2097152000 - grad_clip_warmup_factor: 5 - warmup_min_lr: 0 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/OLMo-small/${run_name} -save_overwrite: false - -save_interval: 1000 -save_interval_ephemeral: null -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2ep -global_train_batch_size: 512 -device_train_microbatch_size: 4 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - # Doesn't work from cache. - # - label: basic_arithmetic - # type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - - label: mmlu_stem_mc_5shot_test - type: downstream - - - label: mmlu_humanities_mc_5shot_test - type: downstream - - - label: mmlu_social_sciences_mc_5shot_test - type: downstream - - - label: mmlu_other_mc_5shot_test - type: downstream - - - label: basic_arithmetic - type: downstream - - - label: trivia_qa_wiki_ppl - type: downstream - - - label: natural_qs_open_ppl - type: downstream - - - label: arc_easy_ppl - type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - instance_filter: - repetition_max_period: 13 - repetition_min_period: 1 - repetition_max_count: 32 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # MEGAWIKA v1 (4.6 GT) - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - # ~> REDPAJAMA STACK-EXCHANGE (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (27.97 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - # ~> CC NEWS (14.3 GT) - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00002.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (138.4 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - # ~> REDDIT (79.9 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - # ~> FALCON (547.341 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 50% (178.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - # ~> DOLMA CC MIDDLE 33% (242.05 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - # ~> DOLMA CC TAIL 33% (191.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy diff --git a/configs/llamaish7-s3.yaml b/configs/llamaish7-s3.yaml deleted file mode 100644 index ef7aef937..000000000 --- a/configs/llamaish7-s3.yaml +++ /dev/null @@ -1,1296 +0,0 @@ -run_name: llamaish7-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: llamaish7 - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: full_megatron - init_std: 0.006 - init_cutoff_factor: 3 - -compile: null - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: false - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 20971520000 - t_max: 3e12 - alpha_f: 0.1 - grad_clip_warmup_steps: 2097152000 - grad_clip_warmup_factor: 5 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/OLMo-medium/${run_name} -save_overwrite: false - -save_interval: 1000 -save_interval_ephemeral: null -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2ep -global_train_batch_size: 512 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - # Doesn't work from cache. - # - label: basic_arithmetic - # type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - - label: mmlu_stem_mc_5shot_test - type: downstream - - - label: mmlu_humanities_mc_5shot_test - type: downstream - - - label: mmlu_social_sciences_mc_5shot_test - type: downstream - - - label: mmlu_other_mc_5shot_test - type: downstream - - - label: basic_arithmetic - type: downstream - - - label: trivia_qa_wiki_ppl - type: downstream - - - label: natural_qs_open_ppl - type: downstream - - - label: arc_easy_ppl - type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - instance_filter: - repetition_max_period: 13 - repetition_min_period: 1 - repetition_max_count: 32 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # MEGAWIKA v1 (4.6 GT) - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - # ~> REDPAJAMA STACK-EXCHANGE (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (27.97 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - # ~> CC NEWS (14.3 GT) - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00002.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (138.4 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - # ~> REDDIT (79.9 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - # ~> FALCON (547.341 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 50% (178.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - # ~> DOLMA CC MIDDLE 33% (242.05 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - # ~> DOLMA CC TAIL 33% (191.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy diff --git a/configs/mcli/.gitignore b/configs/mcli/.gitignore deleted file mode 100644 index 3f3290eeb..000000000 --- a/configs/mcli/.gitignore +++ /dev/null @@ -1 +0,0 @@ -petew-* diff --git a/configs/mcli/ananya-1b-ib.yaml b/configs/mcli/ananya-1b-ib.yaml deleted file mode 100644 index 7e45a639b..000000000 --- a/configs/mcli/ananya-1b-ib.yaml +++ /dev/null @@ -1,4438 +0,0 @@ -run_name: olmo-1b-adamw-mitch-init - -seed: 6198 - -wandb: - name: ${run_name} - project: olmo-small - group: v1-mix - entity: ai2-llm - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - include_bias: false - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - flash_attention: true - -compile: null # causes instability on AMD GPUs - -optimizer: - name: adamw - learning_rate: 1.0e-3 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - alpha_f: 0.1 - -# data: -# paths: ${path.glob:${oc.env:FLASH_DIR}/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/*.npy,${oc.env:FLASH_DIR}/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/*.npy,${oc.env:FLASH_DIR}/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/*/*.npy,${oc.env:FLASH_DIR}/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/*.npy,${oc.env:FLASH_DIR}/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/*.npy,${oc.env:FLASH_DIR}/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/*.npy} -# pad_direction: right -# num_workers: 1 -# drop_last: true -# pin_memory: true -# prefetch_factor: 16 -# persistent_workers: true -# timeout: 0 - -tokenizer: - identifier: allenai/eleuther-ai-gpt-neox-20b-pii-special - truncate_direction: right - -save_folder: /data/ananyaj/${run_name} # doesn't matter since we'll upload to S3 -save_overwrite: false -save_interval: 5000 -save_num_checkpoints_to_keep: 9 -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 -remote_save_folder: s3://ai2-llm/checkpoints/ananya-1b-ablations/${run_name} - -load_path: null - -max_duration: 476837 # 2T tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 8 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - # TODO: do we care about c4 and RP validation? We don't have these tokenized at the moment. - # - label: c4-validation - # subset_num_batches: 10 - # data: - # paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - # num_workers: 2 - # drop_last: true - # pin_memory: true - # persistent_workers: true - # prefetch_factor: 4 - - # - label: rp-validation - # subset_num_batches: 10 - # data: - # paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - # num_workers: 2 - # drop_last: true - # pin_memory: true - # persistent_workers: true - # prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - type: lm - data: - paths: null - pad_direction: right - num_workers: 2 - drop_last: true - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream - -data: - pad_direction: right - num_workers: 2 - prefetch_factor: 8 - drop_last: true - paths: - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/1_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/1_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/68_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/038_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/071_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/096_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/119_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/231_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/241_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/255_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/268_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/268_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/269_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/269_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/270_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/270_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/271_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/271_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/272_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/272_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/273_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/273_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/274_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/274_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/275_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/276_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/276_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/277_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/277_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/278_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/278_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/279_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/279_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/280_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/280_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/281_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/281_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/282_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/282_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/283_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/283_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/284_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/284_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/285_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/285_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/286_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/286_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/287_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/287_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/288_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/288_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/289_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/289_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/290_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/290_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/291_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/291_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/292_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/292_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/293_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/293_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/294_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/294_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/295_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/295_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/296_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/296_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/297_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/297_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/298_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/298_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/299_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/299_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/300_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/300_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/301_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/301_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/302_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/302_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/303_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/303_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/304_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/304_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/305_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/305_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/306_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/306_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/307_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/307_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/308_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/308_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/309_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/309_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/310_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/310_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/311_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/311_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/312_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/312_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/313_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/313_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/314_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/314_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/315_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/315_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/316_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/316_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/317_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/317_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/318_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/318_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/319_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/319_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/320_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/320_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/321_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/321_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/322_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/322_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/323_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/323_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/324_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/324_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/325_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/325_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/326_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/326_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/327_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/327_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/328_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/328_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/329_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/329_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/330_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/330_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/331_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/331_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/332_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/332_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/333_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/333_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/334_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/334_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/335_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/335_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/336_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/336_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/337_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/337_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/338_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/338_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/339_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/339_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/340_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/340_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/341_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/341_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/342_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/342_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/343_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/343_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/344_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/344_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/345_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/345_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/346_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/346_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/347_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/347_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/348_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/348_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/349_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/349_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/350_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/350_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/351_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/351_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/352_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/352_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/353_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/353_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/354_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/354_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/355_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/355_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/356_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/356_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/357_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/357_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/358_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/358_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/359_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/359_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/360_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/360_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/361_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/361_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/362_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/362_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/363_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/363_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/364_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/364_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/365_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/365_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/366_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/366_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/367_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/367_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/368_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/368_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/369_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/369_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/370_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/370_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/371_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/371_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/372_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/372_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/373_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/373_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/374_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/374_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/375_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/375_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/376_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/376_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/377_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/377_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/378_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/378_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/379_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/379_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/380_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/380_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/381_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/381_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/382_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/382_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/383_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/383_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/384_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/384_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/385_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/385_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/386_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/386_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/387_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/387_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/388_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/388_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/389_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/389_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/390_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/390_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/391_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/391_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/392_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/392_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/393_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/393_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/394_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/394_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/395_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/395_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/396_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/396_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/397_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/397_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/398_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/398_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/399_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/399_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/400_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/400_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/401_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/401_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/402_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/402_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/403_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/403_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/404_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/404_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/405_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/405_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/406_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/406_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/407_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/407_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/408_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/408_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/409_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/409_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/410_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/410_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/411_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/411_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/412_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/412_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/413_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/413_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/414_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/414_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/415_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/415_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/416_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/416_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/417_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/417_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/418_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/418_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/419_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/419_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/420_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/420_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/421_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/421_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/422_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/422_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/423_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/423_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/424_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/424_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/425_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/425_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/426_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/426_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/427_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/427_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/428_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/428_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/429_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/429_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/430_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/430_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/431_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/431_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/432_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/432_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/433_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/433_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/434_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/434_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/435_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/435_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/436_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/436_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/437_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/437_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/438_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/438_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/439_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/439_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/440_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/440_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/441_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/441_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/442_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/442_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/443_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/443_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/444_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/444_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/445_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/445_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/446_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/446_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/447_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/447_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/448_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/448_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/449_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/449_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/450_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/450_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/451_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/451_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/452_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/452_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/453_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/453_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/454_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/454_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/455_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/455_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/456_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/457_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/457_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/458_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/459_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/459_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/460_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/460_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/461_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/461_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/462_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/462_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/463_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/463_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/464_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/464_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/465_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/465_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/466_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/466_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/467_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/467_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/468_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/468_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/469_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/469_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/470_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/470_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/471_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/471_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/472_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/472_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/473_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/473_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/474_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/474_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/475_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/475_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/476_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/476_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/477_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/477_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/478_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/479_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/479_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/480_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/480_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/481_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/481_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/482_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/482_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/483_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/483_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/484_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/484_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/485_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/486_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/486_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/004_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/013_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/026_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/037_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/051_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/055_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/085_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/086_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/088_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/096_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/097_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/102_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/128_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/218_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/268_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/268_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/269_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/269_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/270_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/270_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/271_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/271_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/272_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/272_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/273_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/273_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/274_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/274_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/275_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/275_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/276_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/276_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/277_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/277_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/278_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/278_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/279_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/279_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/280_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/280_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/281_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/281_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/282_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/282_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/283_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/283_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/284_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/284_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/285_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/285_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/286_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/286_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/287_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/287_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/288_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/288_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/289_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/289_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/290_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/290_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/291_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/292_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/292_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/293_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/294_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/294_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/295_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/295_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/296_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/296_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/297_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/297_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/298_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/298_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/299_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/299_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/300_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/300_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/301_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/301_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/302_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/302_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/303_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/303_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/304_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/304_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/305_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/305_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/306_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/306_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/307_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/307_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/308_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/309_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/309_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/310_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/310_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/311_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/311_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/312_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/312_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/313_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/313_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/314_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/314_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/315_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/315_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/316_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/316_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/317_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/318_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/318_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/319_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/319_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/320_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/320_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/321_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/322_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/322_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/323_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/323_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/324_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/324_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/325_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/325_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/326_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/326_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/327_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/327_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/328_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/328_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/329_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/329_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/330_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/330_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/331_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/331_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/332_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/332_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/333_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/333_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/334_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/334_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/335_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/335_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/336_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/336_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/337_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/337_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/338_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/338_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/339_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/339_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/340_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/340_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/341_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/341_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/342_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/342_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/343_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/343_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/344_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/344_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/345_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/345_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/346_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/346_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/347_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/347_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/348_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/348_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/349_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/349_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/350_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/350_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/351_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/351_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/352_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/352_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/353_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/354_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/354_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/355_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/355_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/356_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/356_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/357_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/357_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/358_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/358_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/359_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/359_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/360_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/360_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/361_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/361_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/362_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/362_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/363_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/363_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/364_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/364_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/365_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/365_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/366_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/366_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/367_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/367_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/368_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/368_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/369_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/369_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/370_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/370_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/371_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/371_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/372_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/372_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/373_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/373_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/374_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/374_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/375_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/375_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/376_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/376_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/377_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/377_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/378_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/378_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/379_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/379_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/380_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/380_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/381_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/381_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/382_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/382_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/383_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/383_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/384_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/384_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/385_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/385_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/386_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/386_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/387_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/387_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/388_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/388_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/389_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/389_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/390_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/390_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/391_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/391_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/392_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/392_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/393_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/393_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/394_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/394_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/395_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/395_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/396_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/396_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/397_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/397_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/398_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/398_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/399_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/399_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/400_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/400_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/401_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/401_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/402_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/402_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/403_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/403_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/404_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/404_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/405_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/405_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/406_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/406_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/407_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/407_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/408_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/408_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/409_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/409_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/410_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/410_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/411_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/411_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/412_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/412_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/413_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/413_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/414_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/414_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/415_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/415_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/416_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/416_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/417_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/417_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/418_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/418_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/419_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/419_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/420_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/420_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/421_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/421_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/422_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/422_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/423_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/423_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/424_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/424_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/425_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/425_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/426_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/426_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/427_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/427_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/428_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/428_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/429_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/429_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/430_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/430_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/431_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/431_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/432_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/432_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/433_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/433_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/434_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/434_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/435_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/435_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/436_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/436_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/437_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/437_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/438_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/438_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/439_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/439_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/440_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/440_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/441_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/441_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/442_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/442_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/443_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/443_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/444_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/444_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/445_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/445_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/446_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/446_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/447_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/447_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/448_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/448_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/449_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/450_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/450_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/451_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/451_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/452_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/452_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/453_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/453_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/454_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/454_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/455_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/455_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/456_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/456_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/457_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/457_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/458_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/459_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/459_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/460_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/460_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/461_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/461_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/462_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/462_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/463_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/463_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/464_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/464_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/465_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/466_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/466_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/467_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/467_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/468_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/468_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/469_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/469_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/470_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/470_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/471_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/471_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/472_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/472_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/473_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/473_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/474_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/474_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/475_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/475_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/476_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/476_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/477_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/477_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/478_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/478_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/479_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/479_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/480_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/481_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/481_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/482_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/482_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/483_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/483_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/484_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/484_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/485_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/485_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/486_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/486_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/487_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/487_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/488_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/488_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/489_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/489_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/490_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/490_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/491_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/491_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/492_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/492_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/493_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/493_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/494_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/494_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/495_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/495_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/496_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/496_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/497_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/497_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/498_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/498_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/499_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/499_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/500_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/500_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/501_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/501_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/502_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/502_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/503_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/503_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/504_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/504_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/505_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/505_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/506_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/506_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/507_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/507_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/508_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/508_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/509_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/509_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/510_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/510_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/511_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/511_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/512_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/512_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/513_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/513_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/514_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/514_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/515_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/515_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/516_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/516_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/517_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/517_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/518_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/518_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/519_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/519_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/520_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/520_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/521_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/521_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/522_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/522_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/523_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/523_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/524_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/524_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/525_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/525_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/526_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/527_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/527_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/528_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/528_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/529_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/529_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/530_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/530_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/531_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/531_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/532_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/532_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/533_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/533_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/534_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/534_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/535_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/535_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/536_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/536_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/537_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/537_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/538_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/538_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/539_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/539_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/540_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/540_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/541_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/541_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/004_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/013_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/026_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/037_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/038_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/051_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/055_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/071_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/085_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/086_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/088_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/096_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/097_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/102_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/119_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/128_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/218_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/231_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/241_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/255_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/268_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/268_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/269_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/269_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/270_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/270_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/271_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/271_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/272_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/272_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/273_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/273_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/274_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/274_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/275_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/275_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/276_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/276_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/277_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/277_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/278_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/278_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/279_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/279_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/280_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/280_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/281_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/281_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/282_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/282_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/283_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/283_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/284_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/284_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/285_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/285_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/286_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/286_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/287_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/287_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/288_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/288_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/289_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/289_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/290_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/290_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/291_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/291_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/292_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/292_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/293_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/293_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/294_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/294_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/295_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/295_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/296_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/296_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/297_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/297_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/298_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/298_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/299_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/299_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/300_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/300_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/301_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/301_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/302_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/302_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/303_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/303_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/304_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/304_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/305_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/305_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/306_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/306_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/307_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/307_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/308_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/308_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/309_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/309_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/310_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/310_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/311_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/311_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/312_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/312_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/313_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/313_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/314_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/314_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/315_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/315_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/316_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/316_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/317_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/317_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/318_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/318_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/319_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/319_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/320_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/320_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/321_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/321_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/322_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/322_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/323_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/323_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/324_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/324_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/325_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/325_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/326_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/326_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/327_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/327_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/328_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/328_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/329_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/329_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/330_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/330_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/331_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/331_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/332_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/332_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/333_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/333_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/334_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/334_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/335_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/335_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/336_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/336_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/337_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/337_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/338_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/338_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/339_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/339_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/340_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/341_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/341_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/342_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/342_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/343_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/343_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/344_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/344_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/345_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/345_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/346_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/346_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/347_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/347_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/348_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/348_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/349_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/349_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/350_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/350_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/351_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/351_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/352_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/352_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/353_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/353_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/354_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/354_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/355_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/355_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/356_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/356_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/357_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/357_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/358_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/358_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/359_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/359_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/360_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/360_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/361_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/361_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/362_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/362_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/363_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/363_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/364_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/364_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/365_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/365_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/366_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/366_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/367_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/367_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/368_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/368_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/369_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/369_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/370_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/370_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/371_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/371_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/372_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/372_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/373_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/373_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/374_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/374_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/375_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/375_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/376_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/376_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/377_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/377_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/378_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/378_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/379_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/379_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/380_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/380_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/381_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/381_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/382_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/382_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/383_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/383_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/384_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/384_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/385_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/385_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/386_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/386_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/387_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/387_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/388_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/388_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/389_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/389_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/390_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/390_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/391_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/391_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/392_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/392_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/393_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/393_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/394_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/394_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/395_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/395_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/396_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/396_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/397_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/397_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/398_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/398_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/399_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/399_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/400_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/400_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/401_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/401_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/402_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/402_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/403_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/403_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/404_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/404_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/405_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/405_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/406_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/406_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/407_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/407_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/408_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/408_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/409_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/409_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/410_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/410_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/411_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/411_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/412_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/412_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/413_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/413_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/414_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/414_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/415_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/415_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/416_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/416_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/417_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/417_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/418_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/418_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/419_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/419_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/420_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/420_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/421_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/421_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/422_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/422_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/423_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/423_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/424_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/424_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/425_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/425_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/426_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/426_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/427_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/427_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/428_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/428_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/429_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/429_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/430_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/430_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/431_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/431_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/432_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/432_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/433_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/433_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/434_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/434_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/435_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/435_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/436_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/436_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/437_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/437_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/438_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/438_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/439_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/439_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/440_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/440_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/441_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/441_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/442_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/442_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/443_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/443_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/444_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/444_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/445_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/445_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/446_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/446_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/447_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/447_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/448_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/448_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/449_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/449_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/450_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/450_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/451_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/451_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/452_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/452_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/453_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/453_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/454_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/454_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/455_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/455_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/456_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/456_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/457_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/457_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/458_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/458_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/459_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/459_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/460_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/460_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/461_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/461_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/462_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/462_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/463_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/463_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/464_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/464_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/465_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/465_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/466_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/466_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/467_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/467_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/468_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/468_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/469_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/469_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/470_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/470_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/471_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/471_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/472_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/472_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/473_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/473_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/474_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/474_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/475_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/475_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/476_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/476_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/477_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/477_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/478_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/478_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/479_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/479_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/480_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/480_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/481_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/481_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/482_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/482_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/483_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/483_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/484_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/484_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/485_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/485_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/486_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/486_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/487_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/487_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/488_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/488_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/489_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/489_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/490_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/490_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/491_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/491_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/492_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/492_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/493_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/493_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/494_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/494_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/495_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/495_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/496_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/496_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/497_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/497_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/498_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/498_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/499_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/499_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/500_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/500_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/501_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/501_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/502_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/502_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/503_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/503_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/504_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/504_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/505_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/505_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/506_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/506_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/507_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/507_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/508_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/508_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/509_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/509_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/510_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/510_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/511_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/511_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/512_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/512_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/513_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/513_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/514_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/514_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/515_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/515_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/516_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/516_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/517_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/517_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/518_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/518_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/519_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/519_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/520_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/520_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/521_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/521_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/522_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/522_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/523_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/523_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/524_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/524_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/525_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/525_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/526_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/526_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/00_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/00_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/00_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/01_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/01_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/01_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/02_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/02_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/02_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/03_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/03_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/03_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/04_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/04_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/04_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/05_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/05_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/05_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/06_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/06_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/06_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/07_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/07_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/08_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/08_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/08_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/09_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/09_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/09_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/10_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/10_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/10_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/11_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/11_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/11_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/12_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/12_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/12_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/13_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/13_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/14_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/14_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/14_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/15_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/15_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/16_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/16_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/16_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/17_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/17_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/18_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/18_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/18_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/19_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/19_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/19_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/20_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/20_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/20_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/21_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/21_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/22_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/22_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/22_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/23_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/23_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/24_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/24_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/24_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/25_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/25_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/25_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/26_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/26_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/27_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/27_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/27_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/28_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/28_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/29_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/29_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/29_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/30_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/30_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/30_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/31_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/31_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/31_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/32_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/32_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/32_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/33_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/33_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/33_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/34_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/34_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/34_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/35_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/35_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/35_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/36_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/36_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/36_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/37_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/37_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/37_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/38_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/38_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/39_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/39_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/39_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/40_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/40_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/40_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/41_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/41_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/41_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/004_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/004_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/010_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/013_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/015_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/016_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/019_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/021_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/026_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/026_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/030_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/033_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/036_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/037_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/038_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/038_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/043_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/044_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/054_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/055_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/062_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/065_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/067_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/071_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/071_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/073_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/085_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/085_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/086_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/089_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/097_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/102_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/102_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/104_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/109_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/112_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/118_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/119_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/120_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/128_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/130_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/131_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/137_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/143_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/157_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/159_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/164_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/171_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/178_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/186_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/190_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/198_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/200_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/205_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/208_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/214_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/216_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/230_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/231_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/235_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/241_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/247_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/250_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/251_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/255_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/255_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/257_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/262_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/263_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/267_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00004.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/1_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/1_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/1_00002.npy diff --git a/configs/mcli/ananya-1b.yaml b/configs/mcli/ananya-1b.yaml deleted file mode 100644 index 09df62978..000000000 --- a/configs/mcli/ananya-1b.yaml +++ /dev/null @@ -1,4403 +0,0 @@ -run_name: olmo-1b-${optimizer.name}-${model.init_fn}-init - -seed: 6198 - -wandb: - name: ${run_name} - project: olmo-small - group: v1-mix - entity: ai2-llm - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - include_bias: false - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: normal - flash_attention: true - -compile: null # causes instability on AMD GPUs - -optimizer: - name: adamw - learning_rate: 1.0e-3 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - alpha_f: 0.1 - -tokenizer: - identifier: allenai/eleuther-ai-gpt-neox-20b-pii-special - truncate_direction: right - -save_folder: /workspace/${run_name} # doesn't matter since we'll upload to S3 -save_overwrite: false -save_interval: 5000 -save_num_checkpoints_to_keep: 9 -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 -remote_save_folder: s3://ai2-llm/checkpoints/ananya-1b-ablations/${run_name} - -load_path: null - -max_duration: 476837 # 2T tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 8 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} - -evaluators: - - label: all-small-ppl-validation - type: lm - data: - paths: null - pad_direction: right - num_workers: 1 - drop_last: true - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream - -data: - pad_direction: right - num_workers: 1 - drop_last: true - prefetch_factor: 8 - paths: - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/1_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/1_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/68_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/038_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/071_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/096_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/119_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/231_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/241_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/255_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/268_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/268_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/269_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/269_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/270_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/270_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/271_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/271_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/272_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/272_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/273_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/273_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/274_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/274_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/275_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/276_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/276_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/277_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/277_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/278_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/278_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/279_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/279_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/280_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/280_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/281_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/281_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/282_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/282_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/283_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/283_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/284_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/284_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/285_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/285_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/286_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/286_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/287_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/287_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/288_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/288_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/289_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/289_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/290_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/290_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/291_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/291_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/292_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/292_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/293_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/293_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/294_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/294_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/295_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/295_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/296_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/296_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/297_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/297_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/298_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/298_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/299_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/299_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/300_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/300_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/301_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/301_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/302_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/302_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/303_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/303_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/304_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/304_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/305_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/305_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/306_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/306_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/307_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/307_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/308_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/308_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/309_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/309_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/310_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/310_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/311_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/311_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/312_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/312_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/313_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/313_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/314_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/314_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/315_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/315_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/316_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/316_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/317_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/317_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/318_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/318_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/319_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/319_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/320_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/320_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/321_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/321_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/322_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/322_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/323_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/323_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/324_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/324_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/325_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/325_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/326_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/326_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/327_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/327_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/328_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/328_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/329_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/329_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/330_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/330_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/331_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/331_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/332_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/332_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/333_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/333_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/334_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/334_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/335_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/335_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/336_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/336_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/337_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/337_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/338_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/338_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/339_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/339_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/340_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/340_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/341_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/341_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/342_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/342_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/343_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/343_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/344_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/344_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/345_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/345_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/346_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/346_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/347_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/347_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/348_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/348_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/349_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/349_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/350_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/350_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/351_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/351_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/352_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/352_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/353_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/353_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/354_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/354_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/355_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/355_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/356_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/356_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/357_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/357_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/358_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/358_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/359_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/359_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/360_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/360_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/361_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/361_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/362_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/362_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/363_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/363_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/364_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/364_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/365_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/365_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/366_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/366_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/367_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/367_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/368_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/368_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/369_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/369_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/370_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/370_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/371_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/371_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/372_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/372_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/373_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/373_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/374_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/374_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/375_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/375_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/376_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/376_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/377_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/377_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/378_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/378_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/379_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/379_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/380_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/380_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/381_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/381_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/382_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/382_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/383_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/383_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/384_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/384_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/385_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/385_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/386_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/386_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/387_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/387_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/388_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/388_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/389_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/389_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/390_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/390_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/391_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/391_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/392_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/392_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/393_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/393_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/394_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/394_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/395_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/395_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/396_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/396_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/397_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/397_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/398_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/398_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/399_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/399_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/400_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/400_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/401_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/401_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/402_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/402_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/403_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/403_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/404_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/404_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/405_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/405_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/406_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/406_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/407_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/407_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/408_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/408_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/409_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/409_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/410_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/410_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/411_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/411_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/412_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/412_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/413_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/413_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/414_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/414_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/415_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/415_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/416_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/416_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/417_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/417_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/418_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/418_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/419_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/419_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/420_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/420_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/421_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/421_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/422_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/422_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/423_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/423_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/424_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/424_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/425_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/425_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/426_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/426_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/427_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/427_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/428_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/428_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/429_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/429_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/430_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/430_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/431_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/431_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/432_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/432_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/433_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/433_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/434_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/434_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/435_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/435_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/436_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/436_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/437_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/437_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/438_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/438_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/439_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/439_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/440_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/440_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/441_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/441_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/442_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/442_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/443_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/443_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/444_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/444_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/445_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/445_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/446_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/446_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/447_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/447_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/448_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/448_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/449_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/449_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/450_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/450_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/451_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/451_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/452_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/452_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/453_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/453_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/454_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/454_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/455_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/455_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/456_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/457_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/457_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/458_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/459_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/459_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/460_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/460_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/461_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/461_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/462_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/462_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/463_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/463_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/464_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/464_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/465_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/465_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/466_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/466_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/467_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/467_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/468_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/468_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/469_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/469_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/470_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/470_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/471_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/471_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/472_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/472_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/473_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/473_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/474_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/474_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/475_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/475_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/476_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/476_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/477_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/477_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/478_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/479_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/479_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/480_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/480_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/481_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/481_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/482_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/482_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/483_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/483_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/484_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/484_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/485_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/486_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/486_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/004_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/013_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/026_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/037_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/051_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/055_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/085_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/086_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/088_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/096_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/097_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/102_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/128_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/218_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/268_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/268_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/269_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/269_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/270_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/270_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/271_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/271_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/272_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/272_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/273_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/273_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/274_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/274_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/275_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/275_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/276_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/276_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/277_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/277_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/278_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/278_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/279_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/279_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/280_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/280_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/281_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/281_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/282_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/282_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/283_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/283_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/284_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/284_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/285_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/285_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/286_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/286_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/287_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/287_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/288_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/288_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/289_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/289_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/290_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/290_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/291_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/292_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/292_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/293_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/294_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/294_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/295_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/295_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/296_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/296_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/297_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/297_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/298_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/298_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/299_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/299_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/300_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/300_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/301_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/301_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/302_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/302_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/303_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/303_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/304_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/304_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/305_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/305_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/306_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/306_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/307_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/307_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/308_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/309_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/309_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/310_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/310_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/311_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/311_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/312_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/312_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/313_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/313_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/314_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/314_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/315_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/315_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/316_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/316_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/317_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/318_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/318_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/319_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/319_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/320_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/320_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/321_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/322_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/322_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/323_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/323_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/324_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/324_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/325_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/325_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/326_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/326_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/327_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/327_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/328_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/328_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/329_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/329_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/330_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/330_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/331_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/331_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/332_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/332_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/333_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/333_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/334_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/334_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/335_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/335_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/336_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/336_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/337_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/337_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/338_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/338_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/339_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/339_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/340_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/340_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/341_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/341_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/342_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/342_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/343_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/343_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/344_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/344_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/345_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/345_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/346_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/346_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/347_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/347_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/348_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/348_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/349_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/349_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/350_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/350_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/351_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/351_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/352_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/352_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/353_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/354_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/354_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/355_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/355_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/356_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/356_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/357_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/357_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/358_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/358_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/359_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/359_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/360_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/360_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/361_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/361_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/362_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/362_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/363_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/363_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/364_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/364_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/365_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/365_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/366_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/366_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/367_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/367_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/368_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/368_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/369_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/369_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/370_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/370_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/371_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/371_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/372_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/372_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/373_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/373_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/374_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/374_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/375_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/375_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/376_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/376_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/377_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/377_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/378_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/378_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/379_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/379_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/380_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/380_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/381_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/381_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/382_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/382_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/383_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/383_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/384_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/384_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/385_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/385_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/386_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/386_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/387_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/387_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/388_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/388_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/389_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/389_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/390_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/390_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/391_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/391_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/392_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/392_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/393_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/393_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/394_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/394_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/395_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/395_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/396_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/396_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/397_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/397_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/398_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/398_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/399_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/399_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/400_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/400_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/401_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/401_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/402_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/402_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/403_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/403_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/404_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/404_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/405_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/405_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/406_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/406_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/407_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/407_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/408_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/408_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/409_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/409_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/410_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/410_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/411_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/411_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/412_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/412_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/413_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/413_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/414_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/414_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/415_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/415_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/416_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/416_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/417_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/417_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/418_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/418_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/419_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/419_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/420_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/420_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/421_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/421_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/422_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/422_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/423_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/423_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/424_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/424_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/425_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/425_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/426_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/426_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/427_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/427_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/428_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/428_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/429_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/429_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/430_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/430_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/431_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/431_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/432_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/432_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/433_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/433_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/434_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/434_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/435_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/435_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/436_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/436_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/437_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/437_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/438_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/438_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/439_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/439_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/440_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/440_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/441_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/441_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/442_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/442_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/443_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/443_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/444_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/444_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/445_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/445_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/446_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/446_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/447_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/447_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/448_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/448_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/449_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/450_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/450_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/451_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/451_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/452_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/452_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/453_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/453_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/454_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/454_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/455_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/455_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/456_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/456_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/457_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/457_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/458_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/459_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/459_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/460_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/460_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/461_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/461_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/462_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/462_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/463_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/463_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/464_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/464_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/465_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/466_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/466_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/467_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/467_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/468_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/468_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/469_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/469_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/470_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/470_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/471_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/471_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/472_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/472_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/473_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/473_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/474_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/474_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/475_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/475_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/476_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/476_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/477_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/477_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/478_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/478_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/479_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/479_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/480_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/481_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/481_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/482_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/482_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/483_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/483_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/484_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/484_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/485_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/485_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/486_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/486_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/487_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/487_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/488_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/488_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/489_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/489_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/490_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/490_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/491_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/491_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/492_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/492_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/493_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/493_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/494_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/494_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/495_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/495_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/496_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/496_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/497_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/497_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/498_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/498_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/499_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/499_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/500_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/500_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/501_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/501_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/502_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/502_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/503_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/503_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/504_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/504_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/505_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/505_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/506_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/506_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/507_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/507_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/508_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/508_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/509_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/509_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/510_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/510_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/511_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/511_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/512_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/512_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/513_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/513_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/514_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/514_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/515_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/515_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/516_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/516_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/517_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/517_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/518_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/518_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/519_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/519_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/520_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/520_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/521_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/521_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/522_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/522_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/523_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/523_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/524_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/524_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/525_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/525_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/526_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/527_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/527_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/528_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/528_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/529_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/529_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/530_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/530_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/531_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/531_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/532_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/532_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/533_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/533_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/534_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/534_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/535_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/535_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/536_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/536_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/537_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/537_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/538_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/538_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/539_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/539_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/540_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/540_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/541_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/541_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/004_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/013_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/026_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/037_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/038_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/051_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/055_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/071_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/085_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/086_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/088_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/096_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/097_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/102_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/119_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/128_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/218_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/231_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/241_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/255_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/268_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/268_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/269_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/269_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/270_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/270_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/271_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/271_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/272_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/272_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/273_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/273_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/274_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/274_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/275_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/275_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/276_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/276_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/277_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/277_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/278_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/278_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/279_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/279_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/280_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/280_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/281_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/281_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/282_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/282_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/283_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/283_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/284_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/284_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/285_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/285_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/286_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/286_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/287_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/287_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/288_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/288_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/289_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/289_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/290_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/290_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/291_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/291_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/292_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/292_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/293_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/293_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/294_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/294_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/295_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/295_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/296_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/296_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/297_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/297_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/298_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/298_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/299_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/299_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/300_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/300_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/301_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/301_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/302_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/302_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/303_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/303_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/304_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/304_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/305_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/305_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/306_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/306_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/307_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/307_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/308_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/308_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/309_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/309_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/310_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/310_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/311_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/311_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/312_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/312_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/313_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/313_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/314_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/314_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/315_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/315_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/316_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/316_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/317_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/317_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/318_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/318_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/319_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/319_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/320_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/320_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/321_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/321_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/322_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/322_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/323_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/323_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/324_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/324_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/325_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/325_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/326_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/326_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/327_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/327_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/328_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/328_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/329_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/329_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/330_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/330_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/331_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/331_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/332_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/332_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/333_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/333_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/334_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/334_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/335_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/335_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/336_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/336_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/337_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/337_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/338_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/338_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/339_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/339_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/340_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/341_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/341_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/342_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/342_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/343_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/343_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/344_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/344_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/345_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/345_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/346_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/346_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/347_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/347_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/348_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/348_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/349_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/349_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/350_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/350_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/351_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/351_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/352_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/352_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/353_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/353_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/354_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/354_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/355_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/355_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/356_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/356_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/357_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/357_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/358_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/358_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/359_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/359_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/360_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/360_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/361_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/361_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/362_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/362_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/363_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/363_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/364_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/364_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/365_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/365_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/366_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/366_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/367_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/367_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/368_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/368_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/369_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/369_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/370_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/370_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/371_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/371_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/372_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/372_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/373_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/373_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/374_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/374_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/375_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/375_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/376_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/376_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/377_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/377_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/378_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/378_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/379_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/379_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/380_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/380_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/381_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/381_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/382_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/382_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/383_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/383_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/384_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/384_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/385_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/385_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/386_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/386_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/387_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/387_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/388_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/388_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/389_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/389_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/390_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/390_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/391_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/391_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/392_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/392_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/393_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/393_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/394_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/394_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/395_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/395_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/396_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/396_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/397_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/397_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/398_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/398_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/399_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/399_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/400_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/400_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/401_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/401_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/402_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/402_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/403_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/403_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/404_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/404_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/405_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/405_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/406_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/406_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/407_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/407_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/408_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/408_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/409_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/409_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/410_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/410_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/411_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/411_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/412_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/412_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/413_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/413_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/414_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/414_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/415_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/415_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/416_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/416_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/417_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/417_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/418_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/418_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/419_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/419_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/420_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/420_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/421_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/421_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/422_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/422_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/423_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/423_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/424_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/424_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/425_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/425_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/426_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/426_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/427_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/427_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/428_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/428_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/429_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/429_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/430_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/430_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/431_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/431_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/432_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/432_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/433_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/433_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/434_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/434_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/435_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/435_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/436_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/436_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/437_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/437_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/438_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/438_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/439_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/439_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/440_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/440_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/441_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/441_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/442_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/442_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/443_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/443_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/444_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/444_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/445_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/445_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/446_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/446_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/447_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/447_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/448_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/448_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/449_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/449_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/450_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/450_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/451_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/451_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/452_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/452_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/453_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/453_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/454_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/454_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/455_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/455_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/456_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/456_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/457_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/457_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/458_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/458_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/459_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/459_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/460_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/460_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/461_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/461_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/462_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/462_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/463_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/463_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/464_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/464_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/465_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/465_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/466_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/466_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/467_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/467_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/468_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/468_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/469_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/469_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/470_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/470_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/471_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/471_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/472_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/472_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/473_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/473_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/474_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/474_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/475_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/475_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/476_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/476_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/477_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/477_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/478_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/478_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/479_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/479_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/480_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/480_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/481_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/481_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/482_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/482_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/483_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/483_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/484_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/484_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/485_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/485_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/486_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/486_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/487_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/487_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/488_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/488_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/489_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/489_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/490_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/490_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/491_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/491_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/492_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/492_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/493_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/493_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/494_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/494_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/495_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/495_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/496_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/496_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/497_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/497_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/498_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/498_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/499_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/499_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/500_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/500_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/501_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/501_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/502_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/502_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/503_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/503_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/504_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/504_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/505_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/505_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/506_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/506_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/507_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/507_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/508_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/508_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/509_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/509_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/510_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/510_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/511_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/511_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/512_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/512_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/513_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/513_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/514_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/514_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/515_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/515_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/516_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/516_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/517_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/517_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/518_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/518_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/519_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/519_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/520_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/520_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/521_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/521_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/522_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/522_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/523_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/523_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/524_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/524_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/525_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/525_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/526_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/526_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/00_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/00_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/00_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/01_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/01_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/01_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/02_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/02_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/02_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/03_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/03_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/03_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/04_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/04_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/04_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/05_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/05_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/05_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/06_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/06_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/06_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/07_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/07_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/08_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/08_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/08_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/09_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/09_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/09_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/10_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/10_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/10_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/11_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/11_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/11_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/12_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/12_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/12_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/13_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/13_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/14_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/14_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/14_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/15_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/15_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/16_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/16_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/16_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/17_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/17_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/18_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/18_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/18_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/19_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/19_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/19_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/20_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/20_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/20_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/21_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/21_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/22_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/22_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/22_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/23_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/23_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/24_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/24_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/24_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/25_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/25_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/25_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/26_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/26_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/27_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/27_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/27_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/28_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/28_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/29_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/29_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/29_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/30_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/30_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/30_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/31_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/31_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/31_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/32_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/32_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/32_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/33_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/33_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/33_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/34_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/34_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/34_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/35_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/35_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/35_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/36_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/36_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/36_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/37_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/37_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/37_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/38_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/38_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/39_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/39_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/39_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/40_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/40_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/40_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/41_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/41_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/41_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/004_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/004_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/010_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/013_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/015_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/016_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/019_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/021_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/026_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/026_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/030_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/033_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/036_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/037_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/038_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/038_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/043_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/044_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/054_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/055_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/062_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/065_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/067_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/071_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/071_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/073_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/085_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/085_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/086_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/089_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/097_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/102_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/102_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/104_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/109_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/112_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/118_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/119_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/120_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/128_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/130_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/131_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/137_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/143_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/157_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/159_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/164_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/171_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/178_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/186_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/190_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/198_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/200_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/205_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/208_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/214_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/216_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/230_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/231_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/235_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/241_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/247_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/250_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/251_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/255_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/255_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/257_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/262_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/263_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/267_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00004.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/1_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/1_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/1_00002.npy diff --git a/configs/mcli/harvest_hostnames.yaml b/configs/mcli/harvest_hostnames.yaml deleted file mode 100644 index bf88643ea..000000000 --- a/configs/mcli/harvest_hostnames.yaml +++ /dev/null @@ -1,8 +0,0 @@ -name: harvest_hostnames -image: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 -compute: - cluster: r15z4 - gpus: 72 - gpu_type: h100_80gb - instance: oci.bm.gpu.h100.8 -command: hostname -a \ No newline at end of file diff --git a/configs/mcli/mitchish-final.yaml b/configs/mcli/mitchish-final.yaml deleted file mode 100644 index 82e45d13a..000000000 --- a/configs/mcli/mitchish-final.yaml +++ /dev/null @@ -1,110 +0,0 @@ -run_name: olmo-7b-final -image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04 -gpu_num: 64 -cluster: r12z3 -#cluster: r7z2 -gpu_type: a100_40gb -integrations: - - integration_type: git_repo - git_repo: allenai/LLM - git_branch: main - pip_install: -e . - ssh_clone: true -command: |- - # NOTE: For some reason getting S3 and R2 authentication working both from the command line and - # from Python proved to be challenging, maybe because Mosaic's server are in Australia. - # In the end I had to use separate methods to get everything working: - # 1. AWS config files for CLI access. - # 2. Environment variables for boto3 access (to S3 only). - # Since we only need CLI access prior to training, we remove the AWS config files before launching - # the training job. Otherwise the environment variables won't work. - - # Adjust these vars as needed. - #checkpoint=s3://olmo-checkpoints/ai2-llm/olmo-medium/svtto91c/step456000-unsharded - #run_name=mitchish-lumi-2T-final - checkpoint=s3://olmo-checkpoints/ai2-llm/olmo-medium/ho7jy4ey/step432410-unsharded - run_name=mitchish-mcli-2T-final - config=configs/v1_5-mix-medium-mitch-ish-s3.yaml - - # Install aws cli - apt-get update - apt-get install zip unzip - curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" - unzip awscliv2.zip - sudo ./aws/install - - cd LLM - - pip freeze - - # Prepare environment including AWS config files for both S3 and R2 access. - mkdir -p /root/.cache/torch - mkdir /root/checkpoint-unsharded - mkdir /root/data - mkdir /root/.aws - touch /root/.aws/credentials /root/.aws/config - echo '[s3]' >> /root/.aws/credentials - echo "aws_access_key_id = ${AWS_ACCESS_KEY_ID}" >> /root/.aws/credentials - echo "aws_secret_access_key = ${AWS_SECRET_ACCESS_KEY}" >> /root/.aws/credentials - echo '' >> /root/.aws/credentials - echo '[r2]' >> /root/.aws/credentials - echo "aws_access_key_id = ${R2_ACCESS_KEY_ID}" >> /root/.aws/credentials - echo "aws_secret_access_key = ${R2_SECRET_ACCESS_KEY}" >> /root/.aws/credentials - echo "[default]" >> /root/.aws/config - echo "region = auto" >> /root/.aws/config - echo "output = json" >> /root/.aws/config - - #export S3_PROFILE=s3 - #export R2_PROFILE=r2 - export OMP_NUM_THREADS=8 - export LOG_FILTER_TYPE=local_rank0_only - - # Download checkpoint. - echo "Downloading checkpoint '${checkpoint}'..." - - # Download config. - aws s3 cp --profile=r2 --region=auto \ - --endpoint-url=https://a198dc34621661a1a66a02d6eb7c4dc3.r2.cloudflarestorage.com \ - "${checkpoint}/config.yaml" /root/checkpoint-unsharded/ - - # Download trainer state. - aws s3 cp --profile=r2 --region=auto \ - --endpoint-url=https://a198dc34621661a1a66a02d6eb7c4dc3.r2.cloudflarestorage.com \ - "${checkpoint}/train.pt" /root/checkpoint-unsharded/ - - # Download model weights. - aws s3 cp --profile=r2 --region=auto \ - --endpoint-url=https://a198dc34621661a1a66a02d6eb7c4dc3.r2.cloudflarestorage.com \ - "${checkpoint}/model.pt" /root/checkpoint-unsharded/ - - # Download optim state. - aws s3 cp --profile=r2 --region=auto \ - --endpoint-url=https://a198dc34621661a1a66a02d6eb7c4dc3.r2.cloudflarestorage.com \ - "${checkpoint}/optim.pt" /root/checkpoint-unsharded/ - - # Now remove the aws configs so it doesn't mess with data loading / uploading checkpoints to/from S3. - rm -rf /root/.aws - - torchrun \ - --master_addr "$MASTER_ADDR" \ - --master_port "$MASTER_PORT" \ - --nnodes "$NUM_NODES" \ - --node_rank "$NODE_RANK" \ - --nproc_per_node 8 \ - scripts/train.py ${config} \ - --run_name=${run_name} \ - --save_overwrite \ - --save_interval_unsharded=10000 \ - --load_path=/root/checkpoint-unsharded \ - --compile=null \ - --model.flash_attention=true \ - --activation_checkpointing=fine_grained \ - --fsdp.wrapping_strategy=size_based \ - --remote_save_folder=s3://ai2-llm/checkpoints/7b/${run_name} \ - --restore_dataloader=false \ - --eval_interval=100 \ - --data.paths=[s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample-9B/gpt-neox-20b-pii-special/data.npy,s3://ai2-llm/preprocessed/tulu-v2-sft-mixture/gpt-neox-20b-pii-special/data.npy] \ - --optimizer.learning_rate=0.000023 \ - --scheduler.t_warmup=432410 \ - --scheduler.alpha_f=0.001 \ - --scheduler.t_max=434633 # + 2223 diff --git a/configs/mcli/mitchish-instruct.yml b/configs/mcli/mitchish-instruct.yml deleted file mode 100644 index b65a34afe..000000000 --- a/configs/mcli/mitchish-instruct.yml +++ /dev/null @@ -1,104 +0,0 @@ -name: olmo-7b-instruct -image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04 -compute: - #cluster: r12z3 - cluster: r7z2 - gpus: 64 - gpu_type: a100_40gb -integrations: - - integration_type: git_repo - git_repo: allenai/LLM - git_branch: main - pip_install: -e . - ssh_clone: true -command: |- - checkpoint=s3://olmo-checkpoints/ai2-llm/olmo-medium/wd2gxrza/step556000-unsharded - learning_rate=2e-6 - run_name=mitchish-mcli-2.5T-instruct-${learning_rate}-5ep-v2 - - # NOTE: For some reason getting S3 and R2 authentication working both from the command line and - # from Python proved to be challenging, maybe because Mosaic's server are in Australia. - # In the end I had to use separate methods to get everything working: - # 1. AWS config files for CLI access. - # 2. Environment variables for boto3 access (to S3 only). - # Since we only need CLI access prior to training, we remove the AWS config files before launching - # the training job. Otherwise the environment variables won't work. - - # Install aws cli - apt-get update - apt-get install zip unzip - curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" - unzip awscliv2.zip - sudo ./aws/install - - cd LLM - - pip freeze - - # Prepare environment including AWS config files for both S3 and R2 access. - mkdir -p /root/.cache/torch - mkdir /root/checkpoint-unsharded - mkdir /root/.aws - touch /root/.aws/credentials /root/.aws/config - echo '[s3]' >> /root/.aws/credentials - echo "aws_access_key_id = ${AWS_ACCESS_KEY_ID}" >> /root/.aws/credentials - echo "aws_secret_access_key = ${AWS_SECRET_ACCESS_KEY}" >> /root/.aws/credentials - echo '' >> /root/.aws/credentials - echo '[r2]' >> /root/.aws/credentials - echo "aws_access_key_id = ${R2_ACCESS_KEY_ID}" >> /root/.aws/credentials - echo "aws_secret_access_key = ${R2_SECRET_ACCESS_KEY}" >> /root/.aws/credentials - echo "[default]" >> /root/.aws/config - echo "region = auto" >> /root/.aws/config - echo "output = json" >> /root/.aws/config - - #export S3_PROFILE=s3 - #export R2_PROFILE=r2 - export OMP_NUM_THREADS=8 - export LOG_FILTER_TYPE=local_rank0_only - - # Download checkpoint (everything except optimizer state). - echo "Downloading checkpoint '${checkpoint}'..." - - # Download config. - aws s3 cp --profile=r2 --region=auto \ - --endpoint-url=https://a198dc34621661a1a66a02d6eb7c4dc3.r2.cloudflarestorage.com \ - "${checkpoint}/config.yaml" /root/checkpoint-unsharded/ - - # Download trainer state. - aws s3 cp --profile=r2 --region=auto \ - --endpoint-url=https://a198dc34621661a1a66a02d6eb7c4dc3.r2.cloudflarestorage.com \ - "${checkpoint}/train.pt" /root/checkpoint-unsharded/ - - # Download model weights. - aws s3 cp --profile=r2 --region=auto \ - --endpoint-url=https://a198dc34621661a1a66a02d6eb7c4dc3.r2.cloudflarestorage.com \ - "${checkpoint}/model.pt" /root/checkpoint-unsharded/ - - # Download optimizer state. - #aws s3 cp --profile=r2 --region=auto \ - # --endpoint-url=https://a198dc34621661a1a66a02d6eb7c4dc3.r2.cloudflarestorage.com \ - # "${checkpoint}/optim.pt" /root/checkpoint-unsharded/ - - # Now remove the aws configs so it doesn't mess with data loading / uploading checkpoints to/from S3. - rm -rf /root/.aws - - torchrun \ - --master_addr "$MASTER_ADDR" \ - --master_port "$MASTER_PORT" \ - --nnodes "$NUM_NODES" \ - --node_rank "$NODE_RANK" \ - --nproc_per_node 8 \ - scripts/train.py configs/mitchish-instruct.yaml \ - --run_name=${run_name} \ - --optimizer.learning_rate=${learning_rate} \ - --scheduler.grad_clip_warmup_steps=400 \ - --save_overwrite \ - --save_interval_unsharded=100000 \ - --load_path=/root/checkpoint-unsharded \ - --reset_trainer_state \ - --reset_optimizer_state \ - --compile=null \ - --model.flash_attention=true \ - --activation_checkpointing=whole_layer \ - --fsdp.wrapping_strategy=size_based \ - --max_duration=5ep diff --git a/configs/mcli/mitchish.yaml b/configs/mcli/mitchish.yaml deleted file mode 100644 index fd0887e66..000000000 --- a/configs/mcli/mitchish.yaml +++ /dev/null @@ -1,52 +0,0 @@ -run_name: olmo-7b-final -image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04 -gpu_num: 64 -#cluster: r12z3 -cluster: r7z2 -gpu_type: a100_40gb -integrations: - - integration_type: git_repo - git_repo: allenai/LLM - git_branch: main - pip_install: -e . - ssh_clone: true -command: |- - checkpoint=s3://ai2-llm/checkpoints/7b/mitchish-lumi-2T-final/step458000 - run_name=mitchish-lumi-2T-final - config=configs/v1_5-mix-medium-mitch-ish-s3.yaml - - cd LLM - - pip freeze - - # Prepare environment including AWS config files for both S3 and R2 access. - mkdir -p /root/.cache/torch - - export OMP_NUM_THREADS=8 - export LOG_FILTER_TYPE=local_rank0_only - - torchrun \ - --master_addr "$MASTER_ADDR" \ - --master_port "$MASTER_PORT" \ - --nnodes "$NUM_NODES" \ - --node_rank "$NODE_RANK" \ - --nproc_per_node 8 \ - scripts/train.py ${config} \ - --run_name=${run_name} \ - --save_overwrite \ - --save_interval_unsharded=10000 \ - --load_path=${checkpoint} \ - --compile=null \ - --model.flash_attention=true \ - --activation_checkpointing=fine_grained \ - --fsdp.wrapping_strategy=size_based \ - --remote_save_folder=s3://ai2-llm/checkpoints/7b/${run_name} \ - --data.paths=[s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample-9B/gpt-neox-20b-pii-special/data.npy,s3://ai2-llm/preprocessed/tulu-v2-sft-mixture/gpt-neox-20b-pii-special/data.npy] \ - --evaluators=[] \ - --optimizer.learning_rate=0.000023 \ - --scheduler.alpha_f=0.001 \ - --scheduler.t_warmup=456000 \ - --scheduler.t_max=458223 # + 2223 - - #--scheduler.t_warmup=432410 \ - #--scheduler.t_max=434633 # + 2223 diff --git a/configs/mcli/mitchish1.yaml b/configs/mcli/mitchish1.yaml deleted file mode 100644 index bcd24d3ab..000000000 --- a/configs/mcli/mitchish1.yaml +++ /dev/null @@ -1,61 +0,0 @@ -name: olmo-1b -image: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 -compute: - cluster: r15z4 - gpus: 16 - gpu_type: h100_80gb - instance: oci.bm.gpu.h100.8 -integrations: - - integration_type: git_repo - git_repo: allenai/OLMo - git_branch: train-olmo-large - pip_install: -e .[train] - ssh_clone: true - - integration_type: git_repo - git_repo: allenai/OLMo-core - git_branch: main - pip_install: -e . - ssh_clone: true -env_variables: - PIP_DISABLE_PIP_VERSION_CHECK: "1" - OMP_NUM_THREADS: "8" - LOG_FILTER_TYPE: local_rank0_only -command: |- - # Make sure we have a recent flash-attn. - # NOTE: only pinning flash-attn here to future proof it. - pip install flash-attn==2.5.3 --no-build-isolation - - # Show packages for debugging. - pip freeze - - # Prepare environment. - mkdir -p /root/.cache/torch - # warm up huggingface cache - pushd /root/.cache - curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar -xzf - - popd - export HF_DATASETS_OFFLINE=1 - - cd OLMo - - torchrun \ - --master_addr "$MASTER_ADDR" \ - --master_port "$MASTER_PORT" \ - --nnodes "$NUM_NODES" \ - --node_rank "$NODE_RANK" \ - --nproc_per_node 8 \ - scripts/train.py configs/mitchish1-s3.yaml \ - --run_name=mitchish1 \ - --wandb.group=mitchish1 \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=null \ - --fsdp.sharding_strategy=SHARD_GRAD_OP \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=true \ - --canceled_check_interval=50 \ - --gen1_gc_interval=8 \ - --device_train_microbatch_size=8 \ - --global_train_batch_size=512 \ - '--load_path=${path.last_checkpoint:${remote_save_folder}}' \ - --save_overwrite diff --git a/configs/mcli/mitchish7.yaml b/configs/mcli/mitchish7.yaml deleted file mode 100644 index af49f6979..000000000 --- a/configs/mcli/mitchish7.yaml +++ /dev/null @@ -1,67 +0,0 @@ -name: olmo-7b -image: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 -compute: - cluster: r15z4 - gpus: 64 - gpu_type: h100_80gb - instance: oci.bm.gpu.h100.8 -integrations: - - integration_type: git_repo - git_repo: allenai/OLMo - git_branch: train-olmo-large - pip_install: -e .[train] - ssh_clone: true - - integration_type: git_repo - git_repo: allenai/OLMo-core - git_branch: main - pip_install: -e . - ssh_clone: true -env_variables: - PIP_DISABLE_PIP_VERSION_CHECK: "1" - OMP_NUM_THREADS: "8" - LOG_FILTER_TYPE: local_rank0_only -command: |- - # Install AWS CLI (for download unsharded checkpoints). - #apt-get update - #apt-get install zip unzip - #curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" - #unzip awscliv2.zip - #sudo ./aws/install - - # Make sure we have a recent flash-attn. - # NOTE: only pinning flash-attn here to future proof it. - pip install flash-attn==2.5.3 --no-build-isolation - - # Show packages for debugging. - pip freeze - - # Prepare environment. - mkdir -p /root/.cache/torch - # warm up huggingface cache - pushd /root/.cache - curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar -xzf - - popd - export HF_DATASETS_OFFLINE=1 - - cd OLMo - - torchrun \ - --master_addr "$MASTER_ADDR" \ - --master_port "$MASTER_PORT" \ - --nnodes "$NUM_NODES" \ - --node_rank "$NODE_RANK" \ - --nproc_per_node 8 \ - scripts/train.py configs/mitchish7-s3.yaml \ - --run_name=mitchish7 \ - --wandb.group=mitchish7 \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=SHARD_GRAD_OP \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=true \ - --device_train_microbatch_size=2 \ - --global_train_batch_size=1024 \ - --gen1_gc_interval=32 \ - '--load_path=${path.last_checkpoint:${remote_save_folder}}' \ - --save_overwrite diff --git a/configs/mcli/mitchish70-from160510.yaml b/configs/mcli/mitchish70-from160510.yaml deleted file mode 100644 index 85f853b2e..000000000 --- a/configs/mcli/mitchish70-from160510.yaml +++ /dev/null @@ -1,227 +0,0 @@ -name: olmo-70b-from160510 -image: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 -# image: public.ecr.aws/z0f8p3z5/olmo:pytorch2.2.1_cu121-python3.11-ubuntu20.04 -# image: us-central1-docker.pkg.dev/ai2-olmo/olmo/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 -scheduling: - priority: auto - # preemptible: true # means it can be retried - # max_retries: 10 -compute: - cluster: r15z4 - gpus: 896 - gpu_type: h100_80gb - instance: oci.bm.gpu.h100.8 - node_names: - - inst-ll38i-r15z3-workers - - inst-1nnph-r15z3-workers - - inst-edsue-r15z3-workers - - inst-kdmu6-r15z3-workers - - inst-tfi9t-r15z3-workers - - inst-vaqst-r15z3-workers - - inst-rpmhf-r15z3-workers - - inst-dpvjh-r15z3-workers - - inst-pfzsm-r15z3-workers - - inst-vvd97-r15z3-workers - - inst-entnk-r15z3-workers - - inst-awtjo-r15z3-workers - - inst-xdqqd-r15z3-workers - - inst-9hoiv-r15z3-workers - # - inst-mrkck-r15z3-workers # bad - - inst-jhhcv-r15z3-workers - - inst-4ki3x-r15z3-workers - - inst-bsgg4-r15z3-workers - - inst-i9qwf-r15z3-workers - - inst-daiox-r15z3-workers - - inst-ijtgf-r15z3-workers - - inst-rymxc-r15z3-workers - - inst-uou7k-r15z3-workers - - inst-6yvq9-r15z3-workers - - inst-v8mxi-r15z3-workers - - inst-kx7fu-r15z3-workers - - inst-97xv1-r15z3-workers - - inst-vy0zb-r15z3-workers - - inst-csom5-r15z3-workers - - inst-jeel7-r15z3-workers - - inst-o186f-r15z3-workers - - inst-bluc6-r15z3-workers - - inst-toizy-r15z3-workers - - inst-vwwku-r15z3-workers - # - inst-ubbqk-r15z3-workers # maybe bad - - inst-xalw1-r15z3-workers - - inst-grtmk-r15z3-workers - - inst-ytymh-r15z3-workers - - inst-e1ijl-r15z3-workers - - inst-vjsri-r15z3-workers - - inst-kc1z1-r15z3-workers - - inst-cm3ec-r15z3-workers - - inst-xtbwa-r15z3-workers - # - inst-lorl8-r15z3-workers # bad - - inst-aixwt-r15z3-workers - - inst-i6mnk-r15z3-workers - - inst-bktpo-r15z3-workers - - inst-21fqf-r15z3-workers - - inst-ed8jl-r15z3-workers - - inst-5wqam-r15z3-workers - - inst-p1vaa-r15z3-workers - - inst-f0kqy-r15z3-workers - - inst-rnyqr-r15z3-workers - - inst-fdyxp-r15z3-workers - - inst-8jhc4-r15z3-workers - - inst-nv70l-r15z3-workers - # - inst-cupyv-r15z3-workers # maybe bad - - inst-ij1rg-r15z3-workers - - inst-j3mfc-r15z3-workers - - inst-znfjw-r15z3-workers - - inst-5irk5-r15z3-workers - - inst-gn4hg-r15z3-workers - - inst-bn5zq-r15z3-workers - - inst-tw9i6-r15z3-workers - - inst-aj1o1-r15z3-workers - - inst-tturo-r15z3-workers - - inst-uwdwd-r15z3-workers - - inst-glcak-r15z3-workers - - inst-likvg-r15z3-workers - - inst-kxpsv-r15z3-workers - - inst-wrucg-r15z3-workers - - inst-xoiov-r15z3-workers - - inst-yg289-r15z3-workers - #- inst-kdqg8-r15z3-workers - - inst-0mf4w-r15z3-workers - - inst-o3fxl-r15z3-workers - - inst-fatfc-r15z3-workers - - inst-lduqx-r15z3-workers - - inst-v87vf-r15z3-workers - - inst-r01sx-r15z3-workers - - inst-i1ted-r15z3-workers - - inst-vzhyo-r15z3-workers - - inst-evbig-r15z3-workers - - inst-di0ri-r15z3-workers - - inst-w4gwj-r15z3-workers - - inst-pzgox-r15z3-workers - - inst-2oyig-r15z3-workers - - inst-rdvlq-r15z3-workers - - inst-tcttd-r15z3-workers - - inst-tg5bs-r15z3-workers - - inst-xh87c-r15z3-workers - - inst-rtaii-r15z3-workers - - inst-go2bm-r15z3-workers - - inst-8z7hr-r15z3-workers - - inst-ekaiy-r15z3-workers - - inst-ht0xx-r15z3-workers - - inst-bg14o-r15z3-workers - - inst-mrxmj-r15z3-workers - - inst-olazl-r15z3-workers - - inst-eigqe-r15z3-workers - - inst-vwnx8-r15z3-workers - - inst-hzzsd-r15z3-workers - - inst-gggd1-r15z3-workers - - inst-xmxc2-r15z3-workers - - inst-39dwb-r15z3-workers - - inst-jhqyu-r15z3-workers - - inst-pbivr-r15z3-workers - - inst-jgvhh-r15z3-workers - - inst-vv7fg-r15z3-workers - - inst-lwagu-r15z3-workers - - inst-6tz4b-r15z3-workers - - inst-jmxxa-r15z3-workers - - inst-drkao-r15z3-workers - - inst-lpz5k-r15z3-workers - - inst-bv9yy-r15z3-workers - - inst-pyzpn-r15z3-workers - - inst-ivjqi-r15z3-workers - #- inst-qc1pa-r15z3-workers - #- inst-hvw6t-r15z3-workers - #- inst-2iaxk-r15z3-workers - #- inst-dhjn2-r15z3-workers - #- inst-c6t2k-r15z3-workers - #- inst-ih7jm-r15z3-workers - #- inst-g5ojd-r15z3-workers - #- inst-irzic-r15z3-workers - #- inst-uh5f4-r15z3-workers -integrations: - - integration_type: git_repo - git_repo: allenai/OLMo - git_branch: train-olmo-large - pip_install: -e .[train] - ssh_clone: true - - integration_type: git_repo - git_repo: allenai/OLMo-core - git_branch: WorksTorch22 - pip_install: -e . - ssh_clone: true -env_variables: - PIP_DISABLE_PIP_VERSION_CHECK: "1" - OMP_NUM_THREADS: "8" - LOG_FILTER_TYPE: local_rank0_only -command: |- - # Make sure we have a recent flash-attn. - # NOTE: only pinning flash-attn here to future proof it. - pip install flash-attn==2.5.3 --no-build-isolation - # Install AWS CLI (for pre-downloading unsharded checkpoints). - pip install awscli - - # Show packages for debugging. - pip freeze - - # Prepare environment. - mkdir -p /root/.cache/torch - # warm up huggingface cache - pushd /root/.cache - curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar -xzf - - popd - export HF_DATASETS_OFFLINE=1 - - #checkpoint=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-002/step160500-unsharded-hacked - #mkdir /root/checkpoint-unsharded - #aws s3 cp --no-progress ${checkpoint}/config.yaml /root/checkpoint-unsharded/ - #aws s3 cp --no-progress ${checkpoint}/train.pt /root/checkpoint-unsharded/ - #aws s3 cp --no-progress ${checkpoint}/model.safetensors /root/checkpoint-unsharded/ - #aws s3 cp --no-progress ${checkpoint}/optim.safetensors /root/checkpoint-unsharded/ - - cd OLMo - - echo "Launching train script..." - torchrun \ - --nproc_per_node 8 \ - --nnodes 112:112 \ - --rdzv_id=22232 \ - --rdzv_backend=static \ - --rdzv_endpoint=$MASTER_ADDR:29400 \ - --node_rank=$NODE_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py configs/mitchish70-s3.yaml \ - --run_name=mitchish70-from160510 \ - '--wandb.group=${run_name}' \ - '--load_path=${path.last_checkpoint:${remote_save_folder}}' \ - --load_path_sharded_checkpointer=olmo_core \ - --sharded_checkpointer=olmo_core \ - --global_train_batch_size=3584 \ - --device_train_microbatch_size=4 \ - --fsdp.sharding_strategy=HYBRID_SHARD \ - --fsdp.hybrid_sharding_num_model_replicas=4 \ - --time_limit=604800 \ - --save_overwrite \ - --optimizer.learning_rate=3.0e-05 \ - --scheduler.alpha_f=1.0 \ - --scheduler.t_warmup=0 \ - --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-planb/step160510 - -# -# --fsdp.sharding_strategy=HYBRID_SHARD \ -# --fsdp.hybrid_sharding_num_model_replicas=4 \ -# -# '--load_path=${path.last_checkpoint:${remote_save_folder}}' \ -# --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-planc/step197000 \ -# --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-002/step48950 \ -# --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-002/step49000 \ -# --load_path=/root/checkpoint-unsharded \ -# -# gpus: 256 -# --global_train_batch_size=1536 \ -# gpus: 384 -# --global_train_batch_size=1536 \ -# --device_train_microbatch_size=2 \ -# gpus: 896 -# --global_train_batch_size=1792 \ -# gpus: 600 # (75 nodes) -# --global_train_batch_size=1800 \ diff --git a/configs/mcli/mitchish70.yaml b/configs/mcli/mitchish70.yaml deleted file mode 100644 index f37bf0edf..000000000 --- a/configs/mcli/mitchish70.yaml +++ /dev/null @@ -1,96 +0,0 @@ -name: olmo-70b -image: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 -# image: public.ecr.aws/z0f8p3z5/olmo:pytorch2.2.1_cu121-python3.11-ubuntu20.04 -# image: us-central1-docker.pkg.dev/ai2-olmo/olmo/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 -scheduling: - priority: auto - # preemptible: true # means it can be retried - # max_retries: 10 -compute: - cluster: r15z4 - gpus: 896 - gpu_type: h100_80gb - instance: oci.bm.gpu.h100.8 - # node_names: -integrations: - - integration_type: git_repo - git_repo: allenai/OLMo - git_branch: train-olmo-large - pip_install: -e .[train] - ssh_clone: true - - integration_type: git_repo - git_repo: allenai/OLMo-core - git_branch: WorksTorch22 - pip_install: -e . - ssh_clone: true -env_variables: - PIP_DISABLE_PIP_VERSION_CHECK: "1" - OMP_NUM_THREADS: "8" - LOG_FILTER_TYPE: local_rank0_only -command: |- - # Make sure we have a recent flash-attn. - # NOTE: only pinning flash-attn here to future proof it. - pip install flash-attn==2.5.3 --no-build-isolation - # Install AWS CLI (for pre-downloading unsharded checkpoints). - pip install awscli - - # Show packages for debugging. - pip freeze - - # Prepare environment. - mkdir -p /root/.cache/torch - # warm up huggingface cache - pushd /root/.cache - curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar -xzf - - popd - export HF_DATASETS_OFFLINE=1 - - #checkpoint=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-002/step160500-unsharded-hacked - #mkdir /root/checkpoint-unsharded - #aws s3 cp --no-progress ${checkpoint}/config.yaml /root/checkpoint-unsharded/ - #aws s3 cp --no-progress ${checkpoint}/train.pt /root/checkpoint-unsharded/ - #aws s3 cp --no-progress ${checkpoint}/model.safetensors /root/checkpoint-unsharded/ - #aws s3 cp --no-progress ${checkpoint}/optim.safetensors /root/checkpoint-unsharded/ - - cd OLMo - - echo "Launching train script..." - torchrun \ - --master_addr "$MASTER_ADDR" \ - --master_port "$MASTER_PORT" \ - --nnodes "$NUM_NODES" \ - --node_rank "$NODE_RANK" \ - --nproc_per_node 8 \ - scripts/train.py configs/mitchish70-s3.yaml \ - --run_name=mitchish70-pland \ - '--wandb.group=${run_name}' \ - '--load_path=${path.last_checkpoint:${remote_save_folder}}' \ - --load_path_sharded_checkpointer=olmo_core \ - --sharded_checkpointer=olmo_core \ - --optimizer.learning_rate=0.000075 \ - --global_train_batch_size=3584 \ - --device_train_microbatch_size=4 \ - --fsdp.sharding_strategy=HYBRID_SHARD \ - --fsdp.hybrid_sharding_num_model_replicas=4 \ - --time_limit=604800 \ - --save_overwrite - -# -# --fsdp.sharding_strategy=HYBRID_SHARD \ -# --fsdp.hybrid_sharding_num_model_replicas=4 \ -# -# '--load_path=${path.last_checkpoint:${remote_save_folder}}' \ -# --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-planc/step197000 \ -# --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-002/step48950 \ -# --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-002/step49000 \ -# --load_path=/root/checkpoint-unsharded \ -# -# gpus: 256 -# --global_train_batch_size=1536 \ -# gpus: 384 -# --global_train_batch_size=1536 \ -# --device_train_microbatch_size=2 \ -# gpus: 896 -# --global_train_batch_size=1792 \ -# gpus: 600 # (75 nodes) -# --global_train_batch_size=1800 \ diff --git a/configs/mcli/mosaic-ananya-1b.yaml b/configs/mcli/mosaic-ananya-1b.yaml deleted file mode 100644 index 641c487d8..000000000 --- a/configs/mcli/mosaic-ananya-1b.yaml +++ /dev/null @@ -1,21 +0,0 @@ -run_name: olmo-medium-adamw-normal-init -image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 -# cluster: r8z3 -cluster: r9z1 -gpu_num: 32 -# gpu_type: a100_40gb -gpu_type: h100_80gb -integrations: - - integration_type: git_repo - git_repo: allenai/LLM - git_branch: petew-train-updates - pip_install: -e .[all] - ssh_clone: true -command: |- - cd LLM - torchrun --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT \ - --nnodes $NUM_NODES \ - --node_rank $NODE_RANK \ - --nproc_per_node 8 \ - scripts/train.py configs/v1-mix-small-mcli.yaml --load_path=s3://ai2-llm/checkpoints/7b/v1-mix-medium-run-001/step1000 \ No newline at end of file diff --git a/configs/mcli/olmo7-ablation-baseline.yaml b/configs/mcli/olmo7-ablation-baseline.yaml deleted file mode 100644 index 805138d98..000000000 --- a/configs/mcli/olmo7-ablation-baseline.yaml +++ /dev/null @@ -1,47 +0,0 @@ -name: olmo7-ablation-baseline # can't have "_" or "." here -image: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 -compute: - gpus: 64 - cluster: r7z2 - gpu_type: a100_40gb -integrations: - - integration_type: git_repo - git_repo: allenai/OLMo - git_branch: olmo7-ablations - #git_commit: d765e8819f5b0be204c96b0b519de2372b0da729 - pip_install: -e .[train] - ssh_clone: true -command: |- - pip freeze - mkdir -p /root/.cache/torch/ - - export OMP_NUM_THREADS=8 - export LOG_FILTER_TYPE=all_ranks - #export OLMO_NO_SSL=1 - - # warm up huggingface cache - pushd /root/.cache - curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache.tar.gz" | tar -xzf - - popd - export HF_DATASETS_OFFLINE=1 - - cd OLMo - - torchrun \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT \ - --nnodes $NUM_NODES \ - --node_rank $NODE_RANK \ - --nproc_per_node 8 \ - scripts/train.py configs/olmo7-ablation-baseline.yaml \ - --run_name=olmo7-ablation-baseline \ - --wandb.name=baseline \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=FULL_SHARD \ - --save_folder=runs/ \ - --activation_checkpointing=whole_layer \ - --device_train_microbatch_size=3 \ - --global_train_batch_size=6144 \ - --wandb.group=baseline3 \ - --remote_save_folder=s3://ai2-llm/checkpoints/olmo7-ablation/baseline3 diff --git a/configs/mcli/olmo7-ablation-dedupedocs.yaml b/configs/mcli/olmo7-ablation-dedupedocs.yaml deleted file mode 100644 index ccd84be45..000000000 --- a/configs/mcli/olmo7-ablation-dedupedocs.yaml +++ /dev/null @@ -1,46 +0,0 @@ -name: olmo7-ablation-dedupedocs # can't have "_" or "." here -image: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 -compute: - gpus: 64 - cluster: r14z3p2 - gpu_type: h100_80gb -integrations: - - integration_type: git_repo - git_repo: allenai/OLMo - git_branch: olmo7-ablations - #git_commit: d765e8819f5b0be204c96b0b519de2372b0da729 - pip_install: -e .[train] - ssh_clone: true -command: |- - pip freeze - mkdir -p /root/.cache/torch/ - - export OMP_NUM_THREADS=8 - export LOG_FILTER_TYPE=all_ranks - #export OLMO_NO_SSL=1 - - # warm up huggingface cache - pushd /root/.cache - curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache.tar.gz" | tar -xzf - - popd - export HF_DATASETS_OFFLINE=1 - - cd OLMo - - torchrun \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT \ - --nnodes $NUM_NODES \ - --node_rank $NODE_RANK \ - --nproc_per_node 8 \ - scripts/train.py configs/olmo7-ablation-dedupedocs.yaml \ - --run_name=olmo7-ablation-dedupedocs \ - --wandb.name=dedupedocs \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=SHARD_GRAD_OP \ - --save_folder=runs/ \ - --device_train_microbatch_size=3 \ - --global_train_batch_size=6144 \ - --wandb.group=dedupedocs \ - --remote_save_folder=s3://ai2-llm/checkpoints/olmo7-ablation/dedupedocs diff --git a/configs/mcli/olmo7-ablation-dolma17.yaml b/configs/mcli/olmo7-ablation-dolma17.yaml deleted file mode 100644 index 30c3b70ec..000000000 --- a/configs/mcli/olmo7-ablation-dolma17.yaml +++ /dev/null @@ -1,47 +0,0 @@ -name: olmo7-ablation-dolma17 # can't have "_" or "." here -image: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 -compute: - gpus: 128 - cluster: r12z3 - gpu_type: a100_40gb -integrations: - - integration_type: git_repo - git_repo: allenai/OLMo - git_branch: olmo7-ablations - #git_commit: d765e8819f5b0be204c96b0b519de2372b0da729 - pip_install: -e .[train] - ssh_clone: true -command: |- - pip freeze - mkdir -p /root/.cache/torch/ - - export OMP_NUM_THREADS=8 - export LOG_FILTER_TYPE=all_ranks - #export OLMO_NO_SSL=1 - - # warm up huggingface cache - pushd /root/.cache - curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache.tar.gz" | tar -xzf - - popd - export HF_DATASETS_OFFLINE=1 - - cd OLMo - - torchrun \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT \ - --nnodes $NUM_NODES \ - --node_rank $NODE_RANK \ - --nproc_per_node 8 \ - scripts/train.py configs/olmo7-ablation-dolma17.yaml \ - --run_name=olmo7-ablation-dolma17 \ - --wandb.name=dolma17 \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=FULL_SHARD \ - --save_folder=runs/ \ - --activation_checkpointing=whole_layer \ - --device_train_microbatch_size=3 \ - --global_train_batch_size=6144 \ - --wandb.group=dolma17 \ - --remote_save_folder=s3://ai2-llm/checkpoints/olmo7-ablation/dolma17 diff --git a/configs/mcli/v1-mix-medium-mitch-ish.yaml b/configs/mcli/v1-mix-medium-mitch-ish.yaml deleted file mode 100644 index 76de11536..000000000 --- a/configs/mcli/v1-mix-medium-mitch-ish.yaml +++ /dev/null @@ -1,32 +0,0 @@ -run_name: v1-mix-medium-mitch-ish -image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 -gpu_num: 216 -cluster: r12z3 -gpu_type: a100_40gb -integrations: - - integration_type: git_repo - git_repo: allenai/LLM - git_branch: main # make sure to update this! - pip_install: -e . - ssh_clone: true -command: |- - pip freeze - mkdir -p /root/.cache/torch/ - - export OMP_NUM_THREADS=8 - export LOG_FILTER_TYPE=local_rank0_only - export OLMO_NO_SSL=1 # we get SSLErrors all the time on this cluster - #export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128 - - cd LLM - - torchrun \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT \ - --nnodes $NUM_NODES \ - --node_rank $NODE_RANK \ - --nproc_per_node 8 \ - scripts/train.py configs/v1-mix-medium-mitch-ish-s3.yaml \ - --run_name=v1-mix-mitch-ish \ - --model.flash_attention=true \ - --global_train_batch_size=2160 diff --git a/configs/mcli/v1-mix-medium.yaml b/configs/mcli/v1-mix-medium.yaml deleted file mode 100644 index 080b86483..000000000 --- a/configs/mcli/v1-mix-medium.yaml +++ /dev/null @@ -1,33 +0,0 @@ -run_name: v1-mix-medium -image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 -gpu_num: 216 -cluster: r12z3 -gpu_type: a100_40gb -integrations: - - integration_type: git_repo - git_repo: allenai/LLM - git_branch: main # make sure to update this! - pip_install: -e . - ssh_clone: true -command: |- - pip freeze - mkdir -p /root/.cache/torch/ - - export OMP_NUM_THREADS=8 - export LOG_FILTER_TYPE=local_rank0_only - export OLMO_NO_SSL=1 # we get SSLErrors all the time on this cluster - #export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128 - - cd LLM - - torchrun \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT \ - --nnodes $NUM_NODES \ - --node_rank $NODE_RANK \ - --nproc_per_node 8 \ - scripts/train.py configs/v1-mix-medium-s3.yaml \ - --run_name=v1-mix-medium \ - --model.flash_attention=true \ - --scheduler.name=linear_with_warmup \ - --global_train_batch_size=2160 diff --git a/configs/mcli/v1_5-mix-medium-mitch-ish.yaml b/configs/mcli/v1_5-mix-medium-mitch-ish.yaml deleted file mode 100644 index d8d529d9a..000000000 --- a/configs/mcli/v1_5-mix-medium-mitch-ish.yaml +++ /dev/null @@ -1,47 +0,0 @@ -run_name: v1-5-mix-medium-mitch-ish # can't have "_" or "." here -image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 -gpu_num: 216 -cluster: r12z3 -gpu_type: a100_40gb -integrations: - - integration_type: git_repo - git_repo: allenai/LLM - # git_branch: mitchish - git_commit: 148ca062e7f1f7667d7fc0f4346e97467e66ce87 - pip_install: -e . - ssh_clone: true -command: |- - pip freeze - mkdir -p /root/.cache/torch/ - - export OMP_NUM_THREADS=8 - export LOG_FILTER_TYPE=local_rank0_only - #export OLMO_NO_SSL=1 - #export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128 - - cd LLM - - torchrun \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT \ - --nnodes $NUM_NODES \ - --node_rank $NODE_RANK \ - --nproc_per_node 8 \ - scripts/train.py configs/v1_5-mix-medium-mitch-ish-s3.yaml \ - --run_name=v1_5-mix-mitch-ish \ - --wandb.name=v1_5-mix-mitch-ish-mcli-final \ - --global_train_batch_size=2160 \ - --model.flash_attention=true \ - --time_limit=169200 - -# We added these flags in order to get a final checkpoint where we decayed the LR down to 0. -# --eval_interval=100 \ -# --save_interval=500 \ -# --load_path=s3://ai2-llm/checkpoints/7b/v1_5-mix-mitch-ish/step556000 \ -# --remote_save_folder=s3://ai2-llm/checkpoints/7b/v1_5-mix-mitch-ish-final \ -# --epoch=1 \ -# --optimizer.learning_rate=0.000023 \ -# --scheduler.t_warmup=556000 \ -# --scheduler.t_max=557000 \ -# --scheduler.alpha_f=0.001 \ -# --stop_at=557001 diff --git a/configs/mcli/v1_5-mix-medium.yaml b/configs/mcli/v1_5-mix-medium.yaml deleted file mode 100644 index 9eb6b26ea..000000000 --- a/configs/mcli/v1_5-mix-medium.yaml +++ /dev/null @@ -1,33 +0,0 @@ -run_name: v1-5-mix-medium # can't use underscores or "." -image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 -gpu_num: 216 -cluster: r12z3 -gpu_type: a100_40gb -integrations: - - integration_type: git_repo - git_repo: allenai/LLM - git_branch: main # make sure to update this! - pip_install: -e . - ssh_clone: true -command: |- - pip freeze - mkdir -p /root/.cache/torch/ - - export OMP_NUM_THREADS=8 - export LOG_FILTER_TYPE=local_rank0_only - export OLMO_NO_SSL=1 # we get SSLErrors all the time on this cluster - #export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128 - - cd LLM - - torchrun \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT \ - --nnodes $NUM_NODES \ - --node_rank $NODE_RANK \ - --nproc_per_node 8 \ - scripts/train.py configs/v1_5-mix-medium-s3.yaml \ - --run_name=v1_5-mix-mcli \ - --scheduler.name=linear_with_warmup \ - --model.flash_attention=true \ - --global_train_batch_size=2160 diff --git a/configs/mitchish-instruct.yaml b/configs/mitchish-instruct.yaml deleted file mode 100644 index ad247e7bc..000000000 --- a/configs/mitchish-instruct.yaml +++ /dev/null @@ -1,148 +0,0 @@ -run_name: v1_5-mix-medium-mitch-ish -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: v1_5-mix - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: - fullgraph: false - -optimizer: - name: adamw - learning_rate: 2e-5 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 200 - alpha_f: 0.001 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/7b/${run_name} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null # getting errors on LUMI right now -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2ep -global_train_batch_size: 128 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream - -data: - pad_direction: right - num_workers: 0 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - generate_attention_mask: true - paths: - - s3://ai2-llm/preprocessed/tulu-v2-fine-tune/gpt-neox-20b-pii-special/input_ids.npy - label_mask_paths: - - s3://ai2-llm/preprocessed/tulu-v2-fine-tune/gpt-neox-20b-pii-special/label_mask.npy diff --git a/configs/mitchish1-s3.yaml b/configs/mitchish1-s3.yaml deleted file mode 100644 index d67b2bbe8..000000000 --- a/configs/mitchish1-s3.yaml +++ /dev/null @@ -1,1277 +0,0 @@ -run_name: mitchish1-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-small - group: mitchish1 - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 4.0e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 10485760000 - t_max: 3e12 - alpha_f: 0.1 - grad_clip_warmup_steps: 2097152000 - grad_clip_warmup_factor: 5 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/OLMo-small/${run_name} -save_overwrite: false - -save_interval: 1000 -save_interval_ephemeral: null -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2ep -global_train_batch_size: 512 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: null - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - # Doesn't work from cache. - # - label: basic_arithmetic - # type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - - label: mmlu_stem_mc_5shot_test - type: downstream - - - label: mmlu_humanities_mc_5shot_test - type: downstream - - - label: mmlu_social_sciences_mc_5shot_test - type: downstream - - - label: mmlu_other_mc_5shot_test - type: downstream - -data: - pad_direction: right - num_workers: 64 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # MEGAWIKA v1 (4.6 GT) - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - # ~> REDPAJAMA STACK-EXCHANGE (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (27.97 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - # ~> CC NEWS (14.3 GT) - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00002.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (138.4 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - # ~> REDDIT (79.9 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - # ~> FALCON (547.341 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 50% (178.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - # ~> DOLMA CC MIDDLE 33% (242.05 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - # ~> DOLMA CC TAIL 33% (191.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy diff --git a/configs/mitchish35.yaml b/configs/mitchish35.yaml deleted file mode 100644 index 08a7e6ec1..000000000 --- a/configs/mitchish35.yaml +++ /dev/null @@ -1,183 +0,0 @@ -run_name: mitchish35-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-large - group: mitchish35 - -model: - d_model: 7168 - n_heads: 56 - n_layers: 56 - # mlp_ratio: 6 - mlp_hidden_size: 37888 - weight_tying: false - alibi: false - rope: true - flash_attention: false - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 5000 - alpha_f: 0.1 - grad_clip_warmup_steps: 1000 - grad_clip_warmup_factor: 5 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false - -# Sharded checkpoints (best for restarts) -save_interval: 500 -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: local - -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 953674 # 2T tokens -global_train_batch_size: 1024 -device_train_microbatch_size: 1 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block - precision: mixed - -activation_checkpointing: whole_layer - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: sst2 - type: downstream - -data: - paths: ${path.glob:${oc.env:DATA_PATH}/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/*.npy} - pad_direction: right - num_workers: 0 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 diff --git a/configs/mitchish50.yaml b/configs/mitchish50.yaml deleted file mode 100644 index ab0f1880f..000000000 --- a/configs/mitchish50.yaml +++ /dev/null @@ -1,183 +0,0 @@ -run_name: mitchish50-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-large - group: mitchish50 - -model: - d_model: 8192 - n_heads: 64 - n_layers: 64 - # mlp_ratio: 6 - mlp_hidden_size: 40960 - weight_tying: false - alibi: false - rope: true - flash_attention: false - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 5000 - alpha_f: 0.1 - grad_clip_warmup_steps: 1000 - grad_clip_warmup_factor: 5 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false - -# Sharded checkpoints (best for restarts) -save_interval: 500 -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: local - -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 953674 # 2T tokens -global_train_batch_size: 1024 -device_train_microbatch_size: 1 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block - precision: mixed - -activation_checkpointing: whole_layer - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: sst2 - type: downstream - -data: - paths: ${path.glob:${oc.env:DATA_PATH}/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/*.npy} - pad_direction: right - num_workers: 0 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 diff --git a/configs/mitchish65-s3.yaml b/configs/mitchish65-s3.yaml deleted file mode 100644 index f8e7fa84a..000000000 --- a/configs/mitchish65-s3.yaml +++ /dev/null @@ -1,634 +0,0 @@ -run_name: mitchish65-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-large - group: mitchish65 - -model: - d_model: 8192 - n_heads: 64 - n_layers: 80 - # mlp_ratio: 6 - mlp_hidden_size: 44032 - weight_tying: false - alibi: false - rope: true - flash_attention: false - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -# disabled because we are ablating FA2 against a baseline that ran in LUMI -#compile: -# fullgraph: false - - -optimizer: - name: adamw - learning_rate: 1.5e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 10485760000 - t_max: 2e12 - alpha_f: 0.1 - grad_clip_warmup_steps: 2097152000 - grad_clip_warmup_factor: 5 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/65b/${run_name} -save_overwrite: false - -save_interval: 1000 -save_interval_ephemeral: 50 -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: local - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2e12T -global_train_batch_size: 1024 -device_train_microbatch_size: 1 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block - precision: mixed - -activation_checkpointing: whole_layer - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: sst2 - type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-000-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-001-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-002-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-003-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-004-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-005-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-006-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-006-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-007-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-008-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-008-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-009-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-010-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-010-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-011-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-012-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-013-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-013-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-014-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-014-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-015-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-016-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-017-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-018-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-019-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-020-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-021-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-022-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-023-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-024-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-025-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-025-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-026-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-027-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-027-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-028-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-028-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-029-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-030-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-031-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-032-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-033-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-033-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-034-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-034-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-035-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-036-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-037-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-038-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-039-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-040-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-041-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-042-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-042-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-043-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-043-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-044-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-044-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-045-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-046-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-046-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-046-00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-047-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-048-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-049-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-050-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-051-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-052-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-052-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-053-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-053-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-054-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-055-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-055-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-056-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-056-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-057-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-057-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-058-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-059-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-060-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-061-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-062-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-062-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-063-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-063-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-064-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-064-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-064-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-065-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-065-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-065-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-066-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-067-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-067-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-068-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-068-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-069-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-070-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-071-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-072-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-073-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-074-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-075-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-076-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-077-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-078-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-078-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-079-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-079-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-080-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-081-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-082-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-083-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-083-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-084-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-085-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-086-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-086-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-087-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-088-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-089-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-089-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-089-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-090-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-091-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-091-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-092-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-092-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-093-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-093-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-093-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-094-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-094-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-094-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-095-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-095-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-096-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-096-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-097-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-097-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-097-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-098-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-098-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-099-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-099-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-100-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-100-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-100-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-101-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-101-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-102-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-102-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-103-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-103-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-104-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-104-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-105-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-105-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-106-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-106-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-106-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-107-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-107-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-108-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-108-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-109-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-109-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-109-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-110-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-110-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-110-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-111-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-111-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-112-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-112-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-113-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-113-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-114-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-114-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-114-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-115-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-115-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-116-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-116-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-117-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-117-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-118-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-118-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-119-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-119-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-120-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-120-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-120-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-121-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-121-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-122-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-122-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-122-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-123-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-123-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-123-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-124-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-124-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-125-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-125-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-126-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-126-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-127-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-127-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-127-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-128-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-128-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-129-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-129-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-129-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-130-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-130-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-131-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-131-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-132-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-132-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-133-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-133-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-133-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-134-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-134-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-134-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-135-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-135-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-135-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-136-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-136-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-137-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-137-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-137-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-138-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-138-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-139-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-139-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-140-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-140-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-141-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-141-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-141-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-142-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-142-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-142-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-143-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-143-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-144-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-144-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-144-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-145-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-145-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-145-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-146-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-146-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-146-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-147-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-147-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-147-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-148-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-148-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-149-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-149-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-149-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-150-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-150-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-150-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-150-00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-151-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-151-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-152-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-152-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-153-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-153-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-154-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-154-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-155-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-155-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-155-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-156-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-156-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-157-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-157-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-157-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-158-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-158-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-159-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-159-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-160-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-160-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-161-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-161-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-161-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-162-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-162-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-163-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-163-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-164-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-164-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-165-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-165-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-165-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-166-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-166-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-166-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-167-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-167-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-167-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-168-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-168-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-169-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-169-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-170-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-170-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-171-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-171-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-172-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-172-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-173-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-173-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-173-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-174-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-174-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-174-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-175-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-175-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-175-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-176-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-176-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-176-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-177-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-177-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-178-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-178-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-179-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-179-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-180-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-180-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-181-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-181-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-182-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-182-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-182-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-183-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-183-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-183-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-184-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-184-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-185-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-185-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-185-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-186-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-186-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-186-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-187-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-187-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/part-187-00002.npy - diff --git a/configs/mitchish65.yaml b/configs/mitchish65.yaml deleted file mode 100644 index 71339a081..000000000 --- a/configs/mitchish65.yaml +++ /dev/null @@ -1,184 +0,0 @@ -run_name: mitchish65-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-large - group: mitchish65 - -model: - d_model: 8192 - n_heads: 64 - n_layers: 80 - # mlp_ratio: 6 - mlp_hidden_size: 44032 - weight_tying: false - alibi: false - rope: true - flash_attention: false - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 10485760000 - t_max: 2e12 - alpha_f: 0.1 - grad_clip_warmup_steps: 2097152000 - grad_clip_warmup_factor: 5 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false - -save_interval: 1000 -save_interval_ephemeral: 50 -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: local - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2e12T -global_train_batch_size: 2048 -device_train_microbatch_size: 1 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block - precision: mixed - -activation_checkpointing: whole_layer - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: sst2 - type: downstream - -data: - paths: ${path.glob:${oc.env:DATA_PATH}/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/*.npy} - pad_direction: right - num_workers: 0 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 diff --git a/configs/mitchish7-llamainit-s3.yaml b/configs/mitchish7-llamainit-s3.yaml deleted file mode 100644 index 6518747e6..000000000 --- a/configs/mitchish7-llamainit-s3.yaml +++ /dev/null @@ -1,1280 +0,0 @@ -run_name: mitchish7-llamainit-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: mitchish7-llamainit - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: full_megatron - init_std: 0.006 - init_cutoff_factor: 3 - -compile: null - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 10485760000 - t_max: 3e12 - alpha_f: 0.1 - grad_clip_warmup_steps: 2097152000 - grad_clip_warmup_factor: 5 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/OLMo-medium/${run_name} -save_overwrite: false - -save_interval: 1000 -save_interval_ephemeral: null -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2ep -global_train_batch_size: 512 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - # Doesn't work from cache. - # - label: basic_arithmetic - # type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - - label: mmlu_stem_mc_5shot_test - type: downstream - - - label: mmlu_humanities_mc_5shot_test - type: downstream - - - label: mmlu_social_sciences_mc_5shot_test - type: downstream - - - label: mmlu_other_mc_5shot_test - type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # MEGAWIKA v1 (4.6 GT) - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - # ~> REDPAJAMA STACK-EXCHANGE (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (27.97 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - # ~> CC NEWS (14.3 GT) - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00002.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (138.4 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - # ~> REDDIT (79.9 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - # ~> FALCON (547.341 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 50% (178.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - # ~> DOLMA CC MIDDLE 33% (242.05 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - # ~> DOLMA CC TAIL 33% (191.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy diff --git a/configs/mitchish7-s3.yaml b/configs/mitchish7-s3.yaml deleted file mode 100644 index de6bfb37e..000000000 --- a/configs/mitchish7-s3.yaml +++ /dev/null @@ -1,1282 +0,0 @@ -run_name: mitchish7-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: mitchish7 - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 10485760000 - t_max: 3e12 - alpha_f: 0.1 - grad_clip_warmup_steps: 2097152000 - grad_clip_warmup_factor: 5 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/OLMo-medium/${run_name} -save_overwrite: false - -save_interval: 1000 -save_interval_ephemeral: null -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2ep -global_train_batch_size: 512 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - # Doesn't work from cache. - # - label: basic_arithmetic - # type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - - label: mmlu_stem_mc_5shot_test - type: downstream - - - label: mmlu_humanities_mc_5shot_test - type: downstream - - - label: mmlu_social_sciences_mc_5shot_test - type: downstream - - - label: mmlu_other_mc_5shot_test - type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - instance_filter: - repetition_max_period: 13 - repetition_min_period: 1 - repetition_max_count: 32 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # MEGAWIKA v1 (4.6 GT) - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - # ~> REDPAJAMA STACK-EXCHANGE (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (27.97 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - # ~> CC NEWS (14.3 GT) - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00002.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (138.4 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - # ~> REDDIT (79.9 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - # ~> FALCON (547.341 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 50% (178.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - # ~> DOLMA CC MIDDLE 33% (242.05 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - # ~> DOLMA CC TAIL 33% (191.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy diff --git a/configs/mitchish70-s3.yaml b/configs/mitchish70-s3.yaml deleted file mode 100644 index 3fa58d488..000000000 --- a/configs/mitchish70-s3.yaml +++ /dev/null @@ -1,1273 +0,0 @@ -run_name: mitchish70-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-large - group: mitchish70 - -model: - d_model: 8192 - n_heads: 64 - n_kv_heads: 8 - n_layers: 80 - # mlp_ratio: 6 - mlp_hidden_size: 57344 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 50279 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: false - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 10485760000 - t_max: 3e12 - alpha_f: 0.1 - grad_clip_warmup_steps: 2097152000 - grad_clip_warmup_factor: 5 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/OLMo-large/${run_name} -save_overwrite: false - -save_interval: 250 -save_interval_ephemeral: null -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: local - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 3e12T -global_train_batch_size: 512 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -activation_checkpointing: whole_layer - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: 500 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - # Doesn't work from cache. - # - label: basic_arithmetic - # type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - -data: - pad_direction: right - num_workers: 8 - drop_last: true - pin_memory: true - prefetch_factor: 2 - persistent_workers: true - timeout: 0 - instance_filter: - repetition_max_period: 13 - repetition_min_period: 1 - repetition_max_count: 32 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # MEGAWIKA v1 (4.6 GT) - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - # ~> REDPAJAMA STACK-EXCHANGE (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (27.97 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - # ~> CC NEWS (14.3 GT) - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00002.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (138.4 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - # ~> REDDIT (79.9 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - # ~> FALCON (547.341 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 50% (178.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - # ~> DOLMA CC MIDDLE 33% (242.05 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - # ~> DOLMA CC TAIL 33% (191.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy diff --git a/configs/mitchish70.yaml b/configs/mitchish70.yaml deleted file mode 100644 index 6f9dac6b2..000000000 --- a/configs/mitchish70.yaml +++ /dev/null @@ -1,201 +0,0 @@ -run_name: mitchish70-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-large - group: mitchish70 - -model: - d_model: 8192 - n_heads: 64 - n_kv_heads: 8 - n_layers: 80 - # mlp_ratio: 6 - mlp_hidden_size: 57344 - weight_tying: false - alibi: false - rope: true - flash_attention: false - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 10485760000 - t_max: 2e12 - alpha_f: 0.1 - grad_clip_warmup_steps: 2097152000 - grad_clip_warmup_factor: 5 - -tokenizer: - # TODO: this should be: - #identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false - -save_interval: 1000 -save_interval_ephemeral: 50 -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: local - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2e12T -global_train_batch_size: 512 -device_train_microbatch_size: 1 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block - precision: mixed - -activation_checkpointing: whole_layer - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: v2-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - 4chan-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - - label: basic_arithmetic - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - -data: - # TODO: update these - paths: ${path.glob:${oc.env:DATA_PATH}/v1_5-sample/gpt-neox-20b-pii-special-longrunfix/*.npy} - pad_direction: right - num_workers: 0 - drop_last: true diff --git a/configs/olmo-small-ablation.yaml b/configs/olmo-small-ablation.yaml deleted file mode 100644 index 4e46f9289..000000000 --- a/configs/olmo-small-ablation.yaml +++ /dev/null @@ -1,249 +0,0 @@ -run_name: olmo-small-ablation -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: c4-small - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50277 - embedding_size: 50304 - eos_token_id: 50276 - pad_token_id: 50276 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy} - pad_direction: right - num_workers: 4 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: EleutherAI/gpt-neox-20b - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 9 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 10000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 953674 # 2T tokens -max_duration: 95367 # 200B tokens -global_train_batch_size: 1024 -device_train_microbatch_size: 8 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - #- label: c4-validation - # subset_num_batches: 10 - # data: - # paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - # num_workers: 2 - # drop_last: true - # pin_memory: true - # persistent_workers: true - # prefetch_factor: 4 - - #- label: rp-validation - # subset_num_batches: 10 - # data: - # paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - # num_workers: 2 - # drop_last: true - # pin_memory: true - # persistent_workers: true - # prefetch_factor: 4 - - - label: 4chan-validation - data: - paths: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - drop_last: true - - - label: c4_100_domains-validation - data: - paths: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - drop_last: true - - - label: c4_en-validation - data: - paths: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - drop_last: true - - - label: gab-validation - data: - paths: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - drop_last: true - - - label: ice-validation - data: - paths: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - drop_last: true - - - label: m2d2_s2orc-validation - data: - paths: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - drop_last: true - - - label: m2d2_wiki-validation - data: - paths: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - drop_last: true - - - label: manosphere-validation - data: - paths: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - drop_last: true - - - label: mc4_en-validation - data: - paths: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - drop_last: true - - - label: pile-validation - data: - paths: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - drop_last: true - - - label: stack_v2_held_out - data: - paths: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/stack_v2_held_out/000_00000.npy - drop_last: true - - - label: openai_humaneval_test - data: - paths: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/openai_humaneval_test/0_00000.npy - drop_last: true - - - label: mbpp_valid - data: - paths: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/mbpp_valid/0_00000.npy - drop_last: true - - # Too small (not enough tokens for a single batch) - # - label: ptb-validation - # data: - # paths: - # - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - # drop_last: true - - - label: twitterAEE-validation - data: - paths: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - drop_last: true - - # Too small (not enough tokens for a single batch) - # - label: wikitext_103-validation - # data: - # paths: - # - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - # drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/olmo7-ablation-baseline.yaml b/configs/olmo7-ablation-baseline.yaml deleted file mode 100644 index bfe40e48b..000000000 --- a/configs/olmo7-ablation-baseline.yaml +++ /dev/null @@ -1,640 +0,0 @@ -run_name: olmo7-ablation -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo7-ablations - group: olmo7-ablation - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 1000 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 200 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/hrshlkzq/step119000-unsharded/ # 0.5T -load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/j18wauyq/step238000-unsharded/ # 1.0T -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/4xel5n7e/step358000-unsharded/ # 1.5T -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/xtruaap8/step477000-unsharded/ # 2.0T - -no_pre_train_checkpoint: true -reset_optimizer_state: true -reset_trainer_state: true - -max_duration: 100e9T -global_train_batch_size: 2048 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-000-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-001-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-002-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-003-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-004-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-005-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-006-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-006-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-007-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-008-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-008-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-009-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-010-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-010-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-011-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-012-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-013-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-013-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-014-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-014-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-015-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-016-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-017-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-018-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-019-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-020-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-021-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-022-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-023-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-024-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-025-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-025-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-026-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-027-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-027-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-028-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-028-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-029-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-030-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-031-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-032-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-033-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-033-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-034-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-034-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-035-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-036-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-037-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-038-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-039-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-040-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-041-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-042-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-042-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-043-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-043-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-044-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-044-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-045-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-046-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-046-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-046-00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-047-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-048-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-049-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-050-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-051-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-052-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-052-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-053-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-053-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-054-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-055-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-055-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-056-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-056-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-057-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-057-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-058-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-059-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-060-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-061-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-062-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-062-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-063-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-063-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-064-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-064-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-064-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-065-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-065-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-065-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-066-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-067-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-067-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-068-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-068-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-069-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-070-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-071-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-072-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-073-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-074-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-075-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-076-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-077-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-078-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-078-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-079-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-079-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-080-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-081-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-082-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-083-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-083-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-084-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-085-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-086-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-086-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-087-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-088-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-089-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-089-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-089-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-090-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-091-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-091-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-092-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-092-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-093-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-093-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-093-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-094-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-094-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-094-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-095-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-095-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-096-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-096-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-097-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-097-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-097-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-098-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-098-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-099-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-099-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-100-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-100-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-100-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-101-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-101-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-102-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-102-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-103-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-103-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-104-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-104-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-105-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-105-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-106-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-106-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-106-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-107-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-107-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-108-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-108-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-109-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-109-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-109-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-110-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-110-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-110-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-111-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-111-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-112-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-112-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-113-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-113-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-114-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-114-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-114-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-115-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-115-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-116-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-116-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-117-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-117-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-118-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-118-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-119-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-119-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-120-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-120-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-120-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-121-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-121-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-122-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-122-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-122-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-123-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-123-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-123-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-124-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-124-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-125-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-125-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-126-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-126-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-127-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-127-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-127-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-128-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-128-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-129-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-129-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-129-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-130-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-130-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-131-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-131-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-132-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-132-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-133-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-133-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-133-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-134-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-134-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-134-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-135-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-135-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-135-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-136-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-136-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-137-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-137-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-137-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-138-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-138-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-139-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-139-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-140-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-140-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-141-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-141-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-141-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-142-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-142-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-142-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-143-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-143-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-144-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-144-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-144-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-145-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-145-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-145-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-146-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-146-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-146-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-147-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-147-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-147-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-148-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-148-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-149-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-149-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-149-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-150-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-150-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-150-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-150-00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-151-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-151-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-152-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-152-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-153-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-153-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-154-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-154-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-155-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-155-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-155-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-156-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-156-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-157-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-157-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-157-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-158-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-158-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-159-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-159-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-160-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-160-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-161-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-161-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-161-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-162-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-162-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-163-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-163-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-164-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-164-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-165-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-165-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-165-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-166-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-166-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-166-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-167-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-167-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-167-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-168-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-168-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-169-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-169-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-170-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-170-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-171-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-171-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-172-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-172-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-173-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-173-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-173-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-174-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-174-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-174-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-175-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-175-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-175-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-176-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-176-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-176-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-177-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-177-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-178-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-178-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-179-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-179-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-180-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-180-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-181-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-181-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-182-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-182-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-182-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-183-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-183-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-183-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-184-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-184-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-185-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-185-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-185-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-186-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-186-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-186-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-187-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-187-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-187-00002.npy diff --git a/configs/olmo7-ablation-dedupedocs.yaml b/configs/olmo7-ablation-dedupedocs.yaml deleted file mode 100644 index 6cd75f2b6..000000000 --- a/configs/olmo7-ablation-dedupedocs.yaml +++ /dev/null @@ -1,1618 +0,0 @@ -run_name: olmo7-ablation-dedupedocs -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo7-ablations - group: olmo7-ablation-dedupedocs - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 1000 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 200 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/hrshlkzq/step119000-unsharded/ # 0.5T -load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/j18wauyq/step238000-unsharded/ # 1.0T -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/4xel5n7e/step358000-unsharded/ # 1.5T -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/xtruaap8/step477000-unsharded/ # 2.0T - -no_pre_train_checkpoint: true -reset_optimizer_state: true -reset_trainer_state: true - -max_duration: 100e9T -global_train_batch_size: 2048 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V0 (1.84 GT) - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (157.2 GT) - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-78-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-79-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-80-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-81-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-82-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-83-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-84-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-85-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-86-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-87-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-88-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-89-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-90-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-91-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-92-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-93-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-94-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-95-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_doc_le030/gpt-neox-olmo-dolma-v1_5/part-96-00000.npy - # ~> REDDIT (79.988 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - # ~> FALCON (547.341 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-000-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-000-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-001-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-001-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-002-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-002-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-003-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-003-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-004-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-004-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-004-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-005-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-005-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-006-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-006-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-006-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-007-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-007-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-008-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-008-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-008-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-008-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-009-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-009-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-009-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-010-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-010-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-010-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-011-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-011-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-011-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-012-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-012-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-012-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-012-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-013-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-013-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-013-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-014-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-014-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-014-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-015-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-015-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-015-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-016-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-016-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-016-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-017-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-017-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-017-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-017-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-018-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-018-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-018-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-019-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-019-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-019-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-020-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-020-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-020-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-021-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-021-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-021-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-021-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-022-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-022-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-022-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-023-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-023-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-023-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-024-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-024-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-024-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-025-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-025-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-025-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-025-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-026-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-026-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-026-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-027-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-027-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-027-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-028-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-028-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-028-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-029-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-029-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-029-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-030-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-030-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-030-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-030-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-031-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-031-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-031-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-032-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-032-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-032-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-033-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-033-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-033-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-034-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-034-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-034-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-034-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-035-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-035-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-035-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-036-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-036-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-036-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-037-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-037-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-037-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-038-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-038-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-038-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-038-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-039-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-039-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-039-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-040-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-040-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-040-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-041-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-041-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-041-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-042-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-042-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-042-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-043-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-043-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-043-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-043-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-044-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-044-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-044-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-045-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-045-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-045-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-046-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-046-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-046-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-047-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-047-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-047-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-047-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-048-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-048-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-048-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-049-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-049-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-049-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-050-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-050-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-050-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-051-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-051-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-051-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-051-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-052-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-052-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-052-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-053-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-053-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-053-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-054-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-054-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-054-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-055-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-055-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-055-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-055-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-056-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-056-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-056-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-057-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-057-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-057-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-058-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-058-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-058-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-059-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-059-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-059-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-060-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-060-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-060-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-060-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-061-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-061-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-061-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-062-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-062-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-062-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-063-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-063-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-063-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-064-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-064-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-064-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-064-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-065-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-065-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-065-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-066-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-066-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-066-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-067-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-067-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-067-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-068-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-068-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-068-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-068-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-069-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-069-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-069-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-070-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-070-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-070-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-071-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-071-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-071-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-072-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-072-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-072-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-073-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-073-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-073-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-073-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-074-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-074-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-074-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-075-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-075-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-075-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-076-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-076-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-076-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-077-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-077-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-077-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-077-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-078-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-078-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-078-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-079-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-079-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-079-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-080-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-080-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-080-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-081-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-081-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-081-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-081-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-082-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-082-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-082-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-083-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-083-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-083-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-084-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-084-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-084-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-085-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-085-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-085-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-086-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-086-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-086-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-086-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-087-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-087-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-087-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-088-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-088-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-088-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-089-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-089-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-089-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-090-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-090-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-090-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-090-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-091-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-091-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-091-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-092-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-092-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-092-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-093-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-093-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-093-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-094-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-094-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-094-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-094-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-095-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-095-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-095-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-096-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-096-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-096-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-097-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-097-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-097-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-098-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-098-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-098-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-099-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-099-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-099-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-099-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-100-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-100-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-100-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-101-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-101-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-101-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-102-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-102-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-102-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-103-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-103-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-103-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-103-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-104-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-104-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-104-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-105-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-105-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-105-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-106-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-106-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-106-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-107-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-107-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-107-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-107-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-108-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-108-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-108-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-109-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-109-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-109-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-110-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-110-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-110-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-111-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-111-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-111-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-111-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-112-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-112-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-112-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-113-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-113-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-113-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-114-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-114-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-114-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-115-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-115-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-115-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-116-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-116-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-116-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-116-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-117-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-117-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-117-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-118-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-118-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-118-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-119-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-119-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-119-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-120-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-120-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-120-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-120-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-121-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-121-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-121-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-122-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-122-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-122-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-123-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-123-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-123-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-124-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-124-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-124-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-124-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-125-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-125-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-125-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-126-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-126-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-126-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-127-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-127-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-127-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-128-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-128-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-128-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-129-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-129-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-129-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-129-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-130-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-130-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-130-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-131-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-131-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-131-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-132-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-132-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-132-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-133-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-133-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-133-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-133-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-134-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-134-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-134-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-135-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-135-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-135-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-136-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-136-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-136-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-137-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-137-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-137-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-137-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-138-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-138-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-138-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-139-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-139-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-139-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-140-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-140-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-140-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-141-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-141-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-141-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-142-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-142-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-142-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-142-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-143-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-143-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-143-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-144-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-144-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-144-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-145-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-145-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-145-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-146-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-146-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-146-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-146-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-147-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-147-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-147-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-148-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-148-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-148-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-149-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-149-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-149-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-150-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-150-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-150-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-150-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-151-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-151-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-151-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-152-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-152-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-152-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-153-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-153-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-153-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-154-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-154-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-154-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-155-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-155-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-155-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-155-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-156-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-156-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-156-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-157-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-157-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-157-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-158-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-158-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-158-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-159-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-159-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-159-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-159-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-160-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-160-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-160-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-161-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-161-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-161-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-162-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-162-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-162-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-163-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-163-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-163-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-163-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-164-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-164-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-164-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-165-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-165-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-165-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-166-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-166-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-166-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-167-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-167-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-167-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-168-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-168-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-168-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-168-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-169-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-169-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-169-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-170-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-170-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-170-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-171-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-171-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-171-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-172-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-172-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-172-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-172-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-173-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-173-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-173-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-174-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-174-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-174-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-175-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-175-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-175-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-176-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-176-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-176-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-176-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-177-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-177-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-177-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-178-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-178-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-178-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-179-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-179-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-179-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-180-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-180-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-180-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-180-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-181-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-181-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-181-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-182-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-182-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-182-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-183-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-183-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-183-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-184-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-184-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-184-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-185-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-185-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-185-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-185-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-186-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-186-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-186-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-187-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-187-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-187-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-188-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-188-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-188-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-189-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-189-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-189-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-189-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-190-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-190-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-190-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-191-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-191-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-191-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-192-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-192-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-192-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-193-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-193-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-193-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-193-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-194-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-194-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-194-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-195-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-195-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-195-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-196-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-196-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-196-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-197-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-197-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-197-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-198-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-198-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-198-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-198-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-199-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-199-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-199-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-200-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-200-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-200-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-201-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-201-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-201-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-202-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-202-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-202-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-202-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-203-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-203-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-203-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-204-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-204-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-204-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-205-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-205-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-205-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-206-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-206-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-206-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-206-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-207-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-207-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-207-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-208-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-208-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-208-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-209-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-209-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-209-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-210-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-210-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-210-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-211-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-211-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-211-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-211-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-212-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-212-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-212-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-213-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-213-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-213-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-214-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-214-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-214-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-215-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-215-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-215-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-215-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-216-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-216-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-216-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-217-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-217-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-217-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-218-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-218-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-218-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-219-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-219-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-219-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-219-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-220-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-220-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-220-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-221-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-221-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-221-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-222-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-222-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-222-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-223-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-223-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-223-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-224-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-224-00001.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 50% (187.2 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - # ~> DOLMA CC MIDDLE 33% (242.05 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-188-00000.npy - # ~> DOLMA CC TAIL 33% (268.05 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_doc_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy \ No newline at end of file diff --git a/configs/olmo7-ablation-dedupeparas.yaml b/configs/olmo7-ablation-dedupeparas.yaml deleted file mode 100644 index 663c91a41..000000000 --- a/configs/olmo7-ablation-dedupeparas.yaml +++ /dev/null @@ -1,1625 +0,0 @@ -run_name: olmo7-ablation-dedupedocs -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo7-ablations - group: olmo7-ablation-dedupedocs - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 1000 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 200 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/hrshlkzq/step119000-unsharded/ # 0.5T -load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/j18wauyq/step238000-unsharded/ # 1.0T -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/4xel5n7e/step358000-unsharded/ # 1.5T -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/xtruaap8/step477000-unsharded/ # 2.0T - -no_pre_train_checkpoint: true -reset_optimizer_state: true -reset_trainer_state: true - -max_duration: 100e9T -global_train_batch_size: 2048 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V0 (1.84 GT) - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (157.2 GT) - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-78-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-79-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-80-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-81-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-82-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-83-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-84-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-85-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-86-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-87-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-88-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-89-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-90-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-91-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-92-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-93-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-94-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-95-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-96-00000.npy - # ~> REDDIT (79.988 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - # ~> FALCON (547.341 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-000-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-000-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-001-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-001-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-002-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-002-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-003-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-003-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-004-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-004-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-004-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-005-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-005-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-006-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-006-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-006-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-007-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-007-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-008-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-008-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-008-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-008-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-009-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-009-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-009-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-010-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-010-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-010-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-011-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-011-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-011-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-012-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-012-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-012-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-012-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-013-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-013-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-013-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-014-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-014-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-014-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-015-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-015-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-015-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-016-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-016-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-016-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-017-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-017-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-017-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-017-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-018-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-018-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-018-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-019-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-019-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-019-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-020-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-020-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-020-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-021-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-021-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-021-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-021-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-022-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-022-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-022-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-023-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-023-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-023-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-024-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-024-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-024-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-025-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-025-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-025-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-025-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-026-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-026-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-026-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-027-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-027-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-027-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-028-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-028-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-028-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-029-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-029-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-029-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-030-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-030-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-030-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-030-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-031-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-031-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-031-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-032-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-032-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-032-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-033-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-033-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-033-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-034-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-034-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-034-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-034-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-035-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-035-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-035-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-036-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-036-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-036-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-037-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-037-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-037-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-038-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-038-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-038-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-038-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-039-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-039-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-039-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-040-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-040-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-040-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-041-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-041-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-041-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-042-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-042-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-042-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-043-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-043-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-043-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-043-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-044-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-044-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-044-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-045-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-045-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-045-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-046-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-046-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-046-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-047-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-047-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-047-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-047-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-048-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-048-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-048-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-049-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-049-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-049-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-050-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-050-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-050-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-051-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-051-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-051-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-051-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-052-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-052-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-052-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-053-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-053-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-053-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-054-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-054-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-054-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-055-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-055-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-055-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-055-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-056-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-056-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-056-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-057-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-057-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-057-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-058-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-058-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-058-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-059-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-059-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-059-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-060-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-060-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-060-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-060-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-061-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-061-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-061-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-062-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-062-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-062-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-063-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-063-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-063-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-064-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-064-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-064-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-064-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-065-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-065-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-065-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-066-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-066-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-066-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-067-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-067-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-067-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-068-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-068-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-068-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-068-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-069-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-069-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-069-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-070-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-070-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-070-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-071-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-071-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-071-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-072-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-072-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-072-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-073-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-073-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-073-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-073-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-074-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-074-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-074-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-075-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-075-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-075-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-076-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-076-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-076-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-077-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-077-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-077-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-077-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-078-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-078-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-078-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-079-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-079-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-079-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-080-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-080-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-080-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-081-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-081-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-081-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-081-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-082-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-082-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-082-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-083-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-083-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-083-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-084-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-084-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-084-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-085-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-085-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-085-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-086-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-086-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-086-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-086-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-087-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-087-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-087-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-088-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-088-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-088-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-089-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-089-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-089-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-090-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-090-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-090-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-090-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-091-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-091-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-091-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-092-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-092-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-092-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-093-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-093-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-093-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-094-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-094-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-094-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-094-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-095-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-095-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-095-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-096-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-096-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-096-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-097-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-097-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-097-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-098-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-098-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-098-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-099-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-099-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-099-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-099-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-100-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-100-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-100-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-101-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-101-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-101-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-102-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-102-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-102-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-103-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-103-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-103-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-103-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-104-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-104-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-104-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-105-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-105-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-105-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-106-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-106-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-106-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-107-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-107-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-107-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-107-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-108-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-108-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-108-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-109-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-109-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-109-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-110-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-110-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-110-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-111-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-111-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-111-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-111-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-112-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-112-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-112-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-113-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-113-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-113-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-114-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-114-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-114-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-115-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-115-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-115-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-116-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-116-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-116-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-116-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-117-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-117-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-117-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-118-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-118-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-118-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-119-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-119-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-119-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-120-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-120-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-120-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-120-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-121-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-121-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-121-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-122-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-122-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-122-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-123-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-123-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-123-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-124-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-124-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-124-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-124-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-125-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-125-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-125-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-126-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-126-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-126-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-127-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-127-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-127-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-128-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-128-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-128-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-129-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-129-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-129-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-129-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-130-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-130-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-130-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-131-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-131-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-131-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-132-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-132-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-132-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-133-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-133-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-133-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-133-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-134-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-134-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-134-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-135-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-135-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-135-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-136-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-136-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-136-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-137-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-137-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-137-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-137-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-138-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-138-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-138-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-139-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-139-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-139-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-140-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-140-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-140-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-141-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-141-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-141-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-142-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-142-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-142-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-142-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-143-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-143-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-143-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-144-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-144-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-144-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-145-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-145-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-145-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-146-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-146-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-146-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-146-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-147-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-147-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-147-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-148-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-148-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-148-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-149-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-149-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-149-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-150-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-150-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-150-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-150-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-151-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-151-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-151-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-152-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-152-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-152-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-153-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-153-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-153-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-154-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-154-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-154-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-155-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-155-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-155-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-155-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-156-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-156-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-156-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-157-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-157-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-157-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-158-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-158-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-158-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-159-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-159-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-159-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-159-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-160-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-160-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-160-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-161-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-161-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-161-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-162-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-162-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-162-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-163-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-163-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-163-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-163-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-164-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-164-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-164-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-165-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-165-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-165-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-166-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-166-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-166-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-167-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-167-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-167-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-168-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-168-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-168-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-168-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-169-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-169-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-169-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-170-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-170-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-170-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-171-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-171-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-171-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-172-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-172-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-172-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-172-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-173-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-173-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-173-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-174-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-174-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-174-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-175-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-175-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-175-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-176-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-176-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-176-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-176-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-177-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-177-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-177-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-178-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-178-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-178-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-179-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-179-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-179-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-180-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-180-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-180-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-180-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-181-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-181-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-181-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-182-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-182-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-182-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-183-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-183-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-183-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-184-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-184-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-184-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-185-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-185-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-185-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-185-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-186-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-186-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-186-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-187-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-187-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-187-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-188-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-188-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-188-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-189-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-189-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-189-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-189-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-190-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-190-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-190-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-191-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-191-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-191-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-192-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-192-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-192-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-193-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-193-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-193-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-193-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-194-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-194-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-194-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-195-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-195-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-195-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-196-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-196-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-196-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-197-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-197-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-197-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-198-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-198-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-198-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-198-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-199-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-199-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-199-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-200-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-200-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-200-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-201-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-201-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-201-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-202-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-202-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-202-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-202-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-203-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-203-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-203-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-204-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-204-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-204-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-205-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-205-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-205-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-206-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-206-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-206-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-206-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-207-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-207-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-207-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-208-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-208-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-208-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-209-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-209-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-209-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-210-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-210-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-210-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-211-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-211-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-211-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-211-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-212-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-212-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-212-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-213-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-213-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-213-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-214-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-214-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-214-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-215-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-215-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-215-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-215-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-216-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-216-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-216-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-217-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-217-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-217-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-218-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-218-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-218-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-219-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-219-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-219-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-219-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-220-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-220-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-220-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-221-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-221-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-221-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-222-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-222-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-222-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-223-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-223-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-223-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-224-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-224-00001.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 50% (187.2 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - # ~> DOLMA CC MIDDLE 33% (242.05 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - # ~> DOLMA CC TAIL 33% (268.05 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy \ No newline at end of file diff --git a/configs/olmo7-ablation-dolma17.yaml b/configs/olmo7-ablation-dolma17.yaml deleted file mode 100644 index 8ae01387d..000000000 --- a/configs/olmo7-ablation-dolma17.yaml +++ /dev/null @@ -1,1491 +0,0 @@ -run_name: olmo7-ablation-dolma17 -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo7-ablations - group: olmo7-ablation-dolma17 - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 1000 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 200 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/hrshlkzq/step119000-unsharded/ # 0.5T -load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/j18wauyq/step238000-unsharded/ # 1.0T -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/4xel5n7e/step358000-unsharded/ # 1.5T -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/xtruaap8/step477000-unsharded/ # 2.0T - -no_pre_train_checkpoint: true -reset_optimizer_state: true -reset_trainer_state: true - -max_duration: 100e9T -global_train_batch_size: 2048 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V0 (1.84 GT) - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (174.418 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-78-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-79-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-80-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-81-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-82-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-83-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-84-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/c4/gpt-neox-olmo-dolma-v1_5/part-85-00000.npy - # ~> REDDIT (79.988 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - # ~> FALCON (547.341 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-000-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-000-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-001-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-001-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-002-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-002-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-003-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-003-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-004-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-004-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-004-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-005-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-005-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-006-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-006-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-006-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-007-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-007-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-008-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-008-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-008-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-008-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-009-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-009-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-009-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-010-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-010-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-010-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-011-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-011-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-011-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-012-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-012-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-012-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-012-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-013-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-013-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-013-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-014-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-014-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-014-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-015-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-015-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-015-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-016-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-016-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-016-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-017-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-017-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-017-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-017-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-018-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-018-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-018-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-019-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-019-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-019-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-020-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-020-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-020-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-021-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-021-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-021-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-021-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-022-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-022-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-022-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-023-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-023-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-023-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-024-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-024-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-024-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-025-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-025-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-025-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-025-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-026-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-026-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-026-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-027-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-027-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-027-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-028-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-028-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-028-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-029-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-029-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-029-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-030-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-030-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-030-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-030-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-031-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-031-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-031-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-032-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-032-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-032-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-033-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-033-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-033-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-034-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-034-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-034-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-034-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-035-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-035-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-035-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-036-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-036-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-036-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-037-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-037-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-037-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-038-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-038-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-038-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-038-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-039-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-039-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-039-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-040-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-040-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-040-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-041-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-041-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-041-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-042-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-042-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-042-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-043-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-043-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-043-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-043-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-044-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-044-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-044-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-045-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-045-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-045-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-046-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-046-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-046-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-047-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-047-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-047-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-047-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-048-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-048-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-048-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-049-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-049-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-049-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-050-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-050-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-050-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-051-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-051-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-051-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-051-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-052-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-052-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-052-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-053-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-053-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-053-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-054-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-054-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-054-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-055-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-055-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-055-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-055-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-056-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-056-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-056-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-057-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-057-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-057-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-058-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-058-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-058-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-059-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-059-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-059-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-060-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-060-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-060-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-060-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-061-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-061-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-061-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-062-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-062-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-062-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-063-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-063-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-063-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-064-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-064-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-064-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-064-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-065-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-065-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-065-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-066-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-066-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-066-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-067-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-067-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-067-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-068-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-068-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-068-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-068-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-069-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-069-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-069-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-070-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-070-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-070-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-071-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-071-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-071-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-072-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-072-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-072-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-073-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-073-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-073-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-073-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-074-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-074-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-074-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-075-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-075-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-075-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-076-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-076-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-076-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-077-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-077-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-077-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-077-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-078-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-078-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-078-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-079-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-079-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-079-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-080-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-080-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-080-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-081-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-081-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-081-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-081-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-082-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-082-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-082-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-083-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-083-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-083-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-084-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-084-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-084-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-085-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-085-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-085-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-086-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-086-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-086-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-086-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-087-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-087-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-087-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-088-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-088-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-088-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-089-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-089-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-089-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-090-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-090-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-090-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-090-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-091-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-091-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-091-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-092-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-092-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-092-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-093-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-093-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-093-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-094-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-094-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-094-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-094-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-095-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-095-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-095-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-096-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-096-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-096-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-097-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-097-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-097-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-098-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-098-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-098-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-099-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-099-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-099-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-099-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-100-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-100-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-100-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-101-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-101-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-101-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-102-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-102-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-102-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-103-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-103-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-103-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-103-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-104-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-104-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-104-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-105-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-105-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-105-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-106-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-106-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-106-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-107-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-107-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-107-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-107-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-108-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-108-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-108-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-109-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-109-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-109-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-110-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-110-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-110-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-111-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-111-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-111-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-111-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-112-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-112-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-112-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-113-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-113-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-113-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-114-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-114-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-114-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-115-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-115-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-115-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-116-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-116-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-116-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-116-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-117-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-117-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-117-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-118-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-118-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-118-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-119-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-119-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-119-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-120-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-120-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-120-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-120-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-121-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-121-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-121-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-122-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-122-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-122-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-123-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-123-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-123-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-124-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-124-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-124-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-124-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-125-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-125-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-125-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-126-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-126-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-126-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-127-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-127-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-127-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-128-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-128-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-128-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-129-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-129-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-129-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-129-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-130-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-130-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-130-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-131-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-131-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-131-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-132-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-132-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-132-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-133-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-133-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-133-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-133-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-134-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-134-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-134-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-135-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-135-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-135-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-136-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-136-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-136-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-137-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-137-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-137-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-137-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-138-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-138-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-138-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-139-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-139-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-139-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-140-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-140-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-140-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-141-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-141-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-141-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-142-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-142-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-142-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-142-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-143-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-143-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-143-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-144-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-144-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-144-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-145-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-145-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-145-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-146-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-146-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-146-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-146-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-147-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-147-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-147-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-148-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-148-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-148-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-149-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-149-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-149-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-150-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-150-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-150-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-150-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-151-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-151-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-151-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-152-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-152-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-152-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-153-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-153-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-153-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-154-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-154-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-154-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-155-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-155-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-155-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-155-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-156-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-156-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-156-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-157-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-157-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-157-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-158-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-158-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-158-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-159-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-159-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-159-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-159-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-160-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-160-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-160-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-161-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-161-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-161-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-162-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-162-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-162-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-163-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-163-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-163-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-163-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-164-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-164-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-164-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-165-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-165-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-165-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-166-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-166-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-166-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-167-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-167-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-167-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-168-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-168-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-168-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-168-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-169-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-169-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-169-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-170-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-170-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-170-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-171-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-171-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-171-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-172-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-172-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-172-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-172-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-173-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-173-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-173-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-174-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-174-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-174-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-175-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-175-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-175-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-176-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-176-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-176-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-176-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-177-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-177-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-177-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-178-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-178-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-178-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-179-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-179-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-179-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-180-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-180-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-180-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-180-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-181-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-181-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-181-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-182-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-182-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-182-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-183-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-183-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-183-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-184-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-184-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-184-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-185-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-185-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-185-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-185-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-186-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-186-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-186-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-187-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-187-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-187-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-188-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-188-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-188-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-189-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-189-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-189-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-189-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-190-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-190-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-190-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-191-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-191-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-191-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-192-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-192-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-192-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-193-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-193-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-193-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-193-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-194-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-194-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-194-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-195-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-195-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-195-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-196-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-196-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-196-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-197-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-197-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-197-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-198-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-198-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-198-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-198-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-199-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-199-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-199-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-200-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-200-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-200-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-201-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-201-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-201-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-202-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-202-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-202-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-202-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-203-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-203-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-203-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-204-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-204-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-204-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-205-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-205-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-205-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-206-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-206-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-206-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-206-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-207-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-207-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-207-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-208-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-208-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-208-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-209-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-209-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-209-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-210-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-210-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-210-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-211-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-211-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-211-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-211-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-212-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-212-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-212-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-213-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-213-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-213-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-214-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-214-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-214-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-215-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-215-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-215-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-215-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-216-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-216-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-216-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-217-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-217-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-217-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-218-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-218-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-218-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-219-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-219-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-219-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-219-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-220-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-220-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-220-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-221-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-221-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-221-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-222-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-222-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-222-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-223-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-223-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-223-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-224-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-224-00001.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 33% (192.264 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - # ~> DOLMA CC MIDDLE 33% (189.606 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - # ~> DOLMA CC TAIL 33% (294.252 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy \ No newline at end of file diff --git a/configs/olmo7-ablation-final2.yaml b/configs/olmo7-ablation-final2.yaml deleted file mode 100644 index 6e0b465e5..000000000 --- a/configs/olmo7-ablation-final2.yaml +++ /dev/null @@ -1,1258 +0,0 @@ -run_name: olmo7-ablation-final2 -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo7-ablations - group: olmo7-ablation-final2 - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 1000 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 200 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/hrshlkzq/step119000-unsharded/ # 0.5T -load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/j18wauyq/step238000-unsharded/ # 1.0T -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/4xel5n7e/step358000-unsharded/ # 1.5T -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/xtruaap8/step477000-unsharded/ # 2.0T - -no_pre_train_checkpoint: true -reset_optimizer_state: true -reset_trainer_state: true - -max_duration: 100e9T -global_train_batch_size: 2048 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # MEGAWIKA v1 (4.6 GT) - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - # ~> REDPAJAMA STACKEXCHANGE (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - # ~> CC NEWS (14.3 GT) - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00002.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (138.4 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - # ~> REDDIT (79.9 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - # ~> FALCON (547.341 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 50% (178.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - # ~> DOLMA CC MIDDLE 33% (242.05 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - # ~> DOLMA CC TAIL 33% (191.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy \ No newline at end of file diff --git a/configs/olmo7-ablation-refheavy.yaml b/configs/olmo7-ablation-refheavy.yaml deleted file mode 100644 index e764892c9..000000000 --- a/configs/olmo7-ablation-refheavy.yaml +++ /dev/null @@ -1,1704 +0,0 @@ -run_name: olmo7-ablation-refheavy -seed: 61394 -dry_run: false - -wandb: - name: ${run_name} - project: olmo7-ablations - group: olmo7-ablation-refheavy - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 1.5e-4 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 1000 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 200 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/hrshlkzq/step119000-unsharded/ # 0.5T -load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/j18wauyq/step238000-unsharded/ # 1.0T -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/4xel5n7e/step358000-unsharded/ # 1.5T -#load_path: r2://olmo-checkpoints/ai2-llm/olmo-medium/xtruaap8/step477000-unsharded/ # 2.0T - -no_pre_train_checkpoint: true -reset_optimizer_state: true -reset_trainer_state: true - -max_duration: 100e9T -global_train_batch_size: 2048 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: mmlu_stem - type: downstream - - - label: mmlu_humanities - type: downstream - - - label: mmlu_social_sciences - type: downstream - - - label: mmlu_other - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - #- label: copa - # type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: mrpc - # type: downstream - - #- label: sst2 - # type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT x 3) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT x 2) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT x 3) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # ~> REDPAJAMA STACKEXCHANGE (19.63 GT x 2) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> CC NEWS (15 GT x 2) - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v2/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy - # ~> REDPAJAMA ARXIV (19.63 GT x 2) - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT x 2) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V0 (1.84 GT x 5) - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v0_all_train/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (157.2 GT) - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-78-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-79-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-80-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-81-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-82-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-83-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-84-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-85-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-86-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-87-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-88-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-89-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-90-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-91-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-92-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-93-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-94-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-95-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_dd_ngram_docpara_le030/gpt-neox-olmo-dolma-v1_5/part-96-00000.npy - # ~> REDDIT (79.988 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/reddit/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - # ~> FALCON (547.341 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-000-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-000-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-001-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-001-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-002-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-002-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-003-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-003-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-004-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-004-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-004-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-005-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-005-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-006-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-006-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-006-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-007-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-007-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-008-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-008-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-008-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-008-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-009-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-009-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-009-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-010-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-010-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-010-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-011-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-011-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-011-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-012-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-012-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-012-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-012-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-013-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-013-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-013-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-014-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-014-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-014-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-015-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-015-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-015-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-016-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-016-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-016-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-017-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-017-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-017-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-017-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-018-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-018-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-018-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-019-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-019-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-019-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-020-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-020-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-020-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-021-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-021-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-021-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-021-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-022-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-022-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-022-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-023-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-023-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-023-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-024-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-024-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-024-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-025-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-025-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-025-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-025-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-026-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-026-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-026-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-027-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-027-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-027-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-028-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-028-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-028-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-029-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-029-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-029-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-030-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-030-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-030-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-030-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-031-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-031-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-031-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-032-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-032-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-032-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-033-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-033-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-033-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-034-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-034-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-034-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-034-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-035-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-035-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-035-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-036-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-036-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-036-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-037-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-037-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-037-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-038-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-038-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-038-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-038-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-039-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-039-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-039-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-040-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-040-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-040-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-041-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-041-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-041-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-042-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-042-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-042-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-043-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-043-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-043-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-043-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-044-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-044-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-044-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-045-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-045-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-045-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-046-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-046-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-046-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-047-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-047-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-047-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-047-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-048-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-048-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-048-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-049-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-049-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-049-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-050-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-050-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-050-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-051-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-051-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-051-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-051-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-052-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-052-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-052-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-053-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-053-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-053-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-054-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-054-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-054-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-055-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-055-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-055-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-055-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-056-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-056-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-056-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-057-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-057-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-057-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-058-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-058-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-058-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-059-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-059-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-059-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-060-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-060-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-060-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-060-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-061-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-061-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-061-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-062-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-062-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-062-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-063-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-063-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-063-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-064-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-064-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-064-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-064-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-065-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-065-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-065-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-066-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-066-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-066-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-067-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-067-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-067-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-068-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-068-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-068-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-068-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-069-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-069-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-069-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-070-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-070-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-070-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-071-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-071-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-071-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-072-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-072-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-072-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-073-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-073-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-073-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-073-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-074-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-074-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-074-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-075-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-075-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-075-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-076-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-076-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-076-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-077-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-077-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-077-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-077-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-078-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-078-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-078-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-079-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-079-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-079-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-080-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-080-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-080-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-081-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-081-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-081-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-081-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-082-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-082-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-082-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-083-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-083-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-083-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-084-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-084-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-084-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-085-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-085-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-085-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-086-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-086-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-086-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-086-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-087-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-087-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-087-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-088-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-088-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-088-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-089-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-089-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-089-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-090-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-090-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-090-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-090-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-091-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-091-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-091-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-092-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-092-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-092-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-093-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-093-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-093-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-094-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-094-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-094-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-094-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-095-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-095-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-095-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-096-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-096-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-096-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-097-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-097-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-097-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-098-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-098-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-098-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-099-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-099-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-099-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-099-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-100-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-100-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-100-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-101-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-101-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-101-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-102-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-102-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-102-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-103-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-103-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-103-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-103-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-104-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-104-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-104-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-105-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-105-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-105-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-106-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-106-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-106-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-107-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-107-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-107-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-107-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-108-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-108-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-108-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-109-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-109-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-109-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-110-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-110-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-110-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-111-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-111-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-111-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-111-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-112-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-112-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-112-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-113-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-113-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-113-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-114-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-114-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-114-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-115-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-115-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-115-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-116-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-116-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-116-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-116-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-117-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-117-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-117-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-118-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-118-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-118-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-119-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-119-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-119-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-120-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-120-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-120-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-120-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-121-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-121-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-121-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-122-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-122-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-122-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-123-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-123-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-123-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-124-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-124-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-124-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-124-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-125-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-125-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-125-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-126-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-126-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-126-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-127-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-127-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-127-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-128-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-128-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-128-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-129-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-129-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-129-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-129-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-130-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-130-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-130-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-131-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-131-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-131-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-132-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-132-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-132-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-133-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-133-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-133-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-133-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-134-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-134-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-134-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-135-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-135-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-135-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-136-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-136-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-136-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-137-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-137-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-137-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-137-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-138-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-138-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-138-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-139-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-139-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-139-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-140-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-140-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-140-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-141-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-141-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-141-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-142-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-142-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-142-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-142-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-143-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-143-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-143-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-144-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-144-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-144-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-145-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-145-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-145-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-146-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-146-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-146-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-146-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-147-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-147-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-147-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-148-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-148-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-148-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-149-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-149-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-149-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-150-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-150-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-150-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-150-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-151-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-151-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-151-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-152-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-152-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-152-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-153-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-153-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-153-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-154-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-154-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-154-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-155-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-155-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-155-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-155-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-156-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-156-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-156-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-157-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-157-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-157-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-158-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-158-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-158-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-159-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-159-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-159-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-159-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-160-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-160-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-160-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-161-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-161-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-161-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-162-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-162-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-162-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-163-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-163-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-163-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-163-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-164-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-164-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-164-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-165-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-165-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-165-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-166-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-166-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-166-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-167-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-167-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-167-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-168-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-168-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-168-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-168-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-169-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-169-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-169-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-170-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-170-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-170-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-171-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-171-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-171-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-172-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-172-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-172-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-172-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-173-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-173-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-173-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-174-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-174-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-174-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-175-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-175-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-175-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-176-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-176-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-176-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-176-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-177-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-177-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-177-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-178-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-178-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-178-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-179-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-179-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-179-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-180-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-180-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-180-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-180-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-181-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-181-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-181-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-182-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-182-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-182-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-183-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-183-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-183-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-184-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-184-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-184-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-185-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-185-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-185-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-185-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-186-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-186-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-186-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-187-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-187-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-187-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-188-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-188-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-188-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-189-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-189-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-189-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-189-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-190-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-190-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-190-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-191-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-191-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-191-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-192-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-192-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-192-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-193-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-193-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-193-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-193-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-194-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-194-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-194-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-195-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-195-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-195-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-196-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-196-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-196-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-197-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-197-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-197-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-198-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-198-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-198-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-198-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-199-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-199-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-199-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-200-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-200-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-200-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-201-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-201-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-201-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-202-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-202-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-202-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-202-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-203-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-203-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-203-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-204-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-204-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-204-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-205-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-205-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-205-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-206-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-206-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-206-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-206-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-207-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-207-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-207-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-208-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-208-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-208-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-209-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-209-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-209-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-210-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-210-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-210-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-211-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-211-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-211-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-211-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-212-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-212-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-212-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-213-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-213-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-213-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-214-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-214-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-214-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-215-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-215-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-215-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-215-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-216-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-216-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-216-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-217-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-217-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-217-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-218-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-218-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-218-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-219-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-219-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-219-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-219-00003.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-220-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-220-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-220-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-221-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-221-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-221-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-222-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-222-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-222-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-223-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-223-00001.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-223-00002.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-224-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special/part-224-00001.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 35% (127.9 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - # ~> DOLMA CC MIDDLE 35 (164.5 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - # ~> DOLMA CC TAIL 35% (179.1 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7_dd_ngram_docpara_le030_decontam/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy \ No newline at end of file diff --git a/configs/pile-llamaish7-s3.yaml b/configs/pile-llamaish7-s3.yaml deleted file mode 100644 index bf96bd5c4..000000000 --- a/configs/pile-llamaish7-s3.yaml +++ /dev/null @@ -1,528 +0,0 @@ -run_name: pile-llamaish7-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: pile-llamaish7 - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: full_megatron - init_std: 0.006 - init_cutoff_factor: 3 - -compile: null - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: false - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 20971520000 - t_max: 3e12 - alpha_f: 0.1 - grad_clip_warmup_steps: 2097152000 - grad_clip_warmup_factor: 5 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/OLMo-medium/${run_name} -save_overwrite: false - -save_interval: 1000 -save_interval_ephemeral: null -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2ep -global_train_batch_size: 512 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - # Doesn't work from cache. - # - label: basic_arithmetic - # type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - - label: mmlu_stem_mc_5shot_test - type: downstream - - - label: mmlu_humanities_mc_5shot_test - type: downstream - - - label: mmlu_social_sciences_mc_5shot_test - type: downstream - - - label: mmlu_other_mc_5shot_test - type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - instance_filter: - repetition_max_period: 13 - repetition_min_period: 1 - repetition_max_count: 32 - paths: - ######### PILE ######### - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-000-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-000-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-001-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-001-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-002-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-002-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-003-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-003-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-004-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-004-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-005-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-005-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-006-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-006-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-007-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-007-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-008-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-008-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-009-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-009-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-010-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-010-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-011-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-011-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-012-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-012-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-013-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-013-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-014-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-014-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-015-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-015-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-016-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-016-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-017-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-017-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-018-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-018-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-019-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-019-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-020-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-020-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-021-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-021-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-022-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-022-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-023-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-023-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-024-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-024-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-025-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-025-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-026-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-026-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-027-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-027-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-028-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-028-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-029-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-029-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-030-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-030-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-031-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-031-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-032-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-032-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-033-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-033-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-034-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-034-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-035-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-035-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-036-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-036-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-037-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-037-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-038-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-038-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-039-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-039-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-040-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-040-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-041-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-041-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-042-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-042-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-043-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-043-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-044-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-044-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-045-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-045-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-046-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-046-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-047-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-047-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-048-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-048-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-049-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-049-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-050-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-050-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-051-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-051-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-052-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-052-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-053-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-053-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-054-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-054-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-055-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-055-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-056-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-056-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-057-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-057-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-058-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-058-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-059-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-059-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-060-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-060-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-061-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-061-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-062-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-062-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-063-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-063-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-064-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-064-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-065-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-065-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-066-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-066-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-067-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-067-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-068-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-068-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-069-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-069-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-070-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-070-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-071-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-071-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-072-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-072-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-073-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-073-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-074-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-074-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-075-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-075-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-076-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-076-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-077-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-077-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-078-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-078-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-079-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-079-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-080-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-080-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-081-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-081-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-082-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-082-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-083-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-083-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-084-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-084-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-085-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-085-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-086-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-086-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-087-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-087-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-088-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-088-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-089-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-089-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-090-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-090-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-091-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-091-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-092-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-092-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-093-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-093-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-094-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-094-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-095-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-095-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-096-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-096-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-097-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-097-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-098-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-098-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-099-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-099-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-100-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-100-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-101-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-101-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-102-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-102-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-103-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-103-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-104-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-104-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-105-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-105-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-106-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-106-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-107-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-107-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-108-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-108-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-109-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-109-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-110-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-110-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-111-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-111-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-112-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-112-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-113-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-113-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-114-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-114-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-115-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-115-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-116-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-116-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-117-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-117-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-118-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-118-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-119-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-119-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-120-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-120-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-121-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-121-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-122-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-122-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-123-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-123-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-124-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-124-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-125-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-125-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-126-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-126-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-127-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-127-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-128-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-128-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-129-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-129-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-130-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-130-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-131-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-131-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-132-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-132-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-133-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-133-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-134-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-134-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-135-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-135-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-136-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-136-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-137-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-137-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-138-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-138-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-139-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-139-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-140-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-140-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-141-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-141-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-142-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-142-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-143-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-143-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-144-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-144-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-145-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-145-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-146-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-146-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-147-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-147-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-148-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-148-00001.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-149-00000.npy - - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-149-00001.npy diff --git a/configs/pile-llamaish7.yaml b/configs/pile-llamaish7.yaml deleted file mode 100644 index b7e1cdcc4..000000000 --- a/configs/pile-llamaish7.yaml +++ /dev/null @@ -1,227 +0,0 @@ -run_name: pile-llamaish7-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: pile-llamaish7 - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: full_megatron - init_std: 0.006 - init_cutoff_factor: 3 - -compile: null - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: false - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 20971520000 - t_max: 3e12 - alpha_f: 0.1 - grad_clip_warmup_steps: 2097152000 - grad_clip_warmup_factor: 5 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/OLMo-medium/${run_name} -save_overwrite: false - -save_interval: 1000 -save_interval_ephemeral: null -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2ep -global_train_batch_size: 512 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - # Doesn't work from cache. - # - label: basic_arithmetic - # type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - - label: mmlu_stem_mc_5shot_test - type: downstream - - - label: mmlu_humanities_mc_5shot_test - type: downstream - - - label: mmlu_social_sciences_mc_5shot_test - type: downstream - - - label: mmlu_other_mc_5shot_test - type: downstream - -data: - instance_filter: - repetition_max_period: 13 - repetition_min_period: 1 - repetition_max_count: 32 - paths: ${path.glob:${oc.env:DATA_PATH}/pile/gpt-neox-20b-pii-special/*.npy} - pad_direction: right - num_workers: 0 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 \ No newline at end of file diff --git a/configs/tiny-llamaish-s3.yaml b/configs/tiny-llamaish-s3.yaml deleted file mode 100644 index 22151a3de..000000000 --- a/configs/tiny-llamaish-s3.yaml +++ /dev/null @@ -1,1284 +0,0 @@ -run_name: tiny-llamaish-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: tiny-llamaish - -model: - d_model: 256 - n_heads: 2 - n_layers: 2 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: full_megatron - init_std: 0.006 - init_cutoff_factor: 3 - -compile: null - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: false - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 20971520000 - t_max: 3e12 - alpha_f: 0.1 - grad_clip_warmup_steps: 2097152000 - grad_clip_warmup_factor: 5 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/OLMo-medium/${run_name} -save_overwrite: false - -save_interval: 1000 -save_interval_ephemeral: null -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2ep -global_train_batch_size: 512 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - # Doesn't work from cache. - # - label: basic_arithmetic - # type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - - label: mmlu_stem_mc_5shot_test - type: downstream - - - label: mmlu_humanities_mc_5shot_test - type: downstream - - - label: mmlu_social_sciences_mc_5shot_test - type: downstream - - - label: mmlu_other_mc_5shot_test - type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - instance_filter: - repetition_max_period: 13 - repetition_min_period: 1 - repetition_max_count: 32 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # MEGAWIKA v1 (4.6 GT) - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - # ~> REDPAJAMA STACK-EXCHANGE (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (27.97 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - # ~> CC NEWS (14.3 GT) - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00002.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (138.4 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - # ~> REDDIT (79.9 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - # ~> FALCON (547.341 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 50% (178.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - # ~> DOLMA CC MIDDLE 33% (242.05 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - # ~> DOLMA CC TAIL 33% (191.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy diff --git a/configs/v1-mix-medium-mitch-ish-s3.yaml b/configs/v1-mix-medium-mitch-ish-s3.yaml deleted file mode 100644 index 627ce0354..000000000 --- a/configs/v1-mix-medium-mitch-ish-s3.yaml +++ /dev/null @@ -1,4420 +0,0 @@ -run_name: v1-mix-medium-mitch-ish -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: v1-mix - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: false - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: - fullgraph: false - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/7b/${run_name} -save_overwrite: true -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null # getting errors on LUMI right now -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 476837 # 2T tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/1_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/1_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/68_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/038_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/071_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/096_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/119_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/231_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/241_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/255_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/268_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/268_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/269_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/269_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/270_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/270_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/271_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/271_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/272_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/272_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/273_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/273_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/274_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/274_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/275_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/276_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/276_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/277_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/277_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/278_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/278_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/279_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/279_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/280_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/280_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/281_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/281_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/282_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/282_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/283_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/283_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/284_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/284_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/285_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/285_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/286_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/286_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/287_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/287_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/288_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/288_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/289_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/289_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/290_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/290_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/291_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/291_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/292_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/292_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/293_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/293_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/294_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/294_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/295_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/295_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/296_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/296_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/297_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/297_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/298_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/298_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/299_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/299_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/300_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/300_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/301_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/301_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/302_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/302_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/303_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/303_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/304_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/304_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/305_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/305_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/306_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/306_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/307_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/307_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/308_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/308_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/309_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/309_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/310_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/310_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/311_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/311_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/312_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/312_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/313_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/313_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/314_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/314_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/315_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/315_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/316_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/316_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/317_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/317_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/318_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/318_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/319_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/319_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/320_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/320_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/321_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/321_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/322_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/322_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/323_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/323_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/324_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/324_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/325_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/325_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/326_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/326_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/327_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/327_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/328_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/328_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/329_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/329_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/330_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/330_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/331_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/331_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/332_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/332_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/333_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/333_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/334_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/334_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/335_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/335_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/336_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/336_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/337_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/337_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/338_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/338_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/339_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/339_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/340_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/340_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/341_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/341_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/342_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/342_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/343_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/343_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/344_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/344_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/345_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/345_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/346_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/346_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/347_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/347_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/348_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/348_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/349_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/349_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/350_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/350_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/351_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/351_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/352_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/352_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/353_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/353_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/354_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/354_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/355_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/355_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/356_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/356_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/357_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/357_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/358_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/358_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/359_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/359_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/360_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/360_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/361_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/361_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/362_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/362_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/363_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/363_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/364_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/364_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/365_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/365_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/366_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/366_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/367_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/367_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/368_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/368_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/369_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/369_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/370_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/370_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/371_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/371_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/372_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/372_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/373_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/373_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/374_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/374_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/375_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/375_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/376_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/376_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/377_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/377_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/378_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/378_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/379_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/379_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/380_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/380_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/381_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/381_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/382_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/382_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/383_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/383_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/384_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/384_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/385_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/385_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/386_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/386_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/387_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/387_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/388_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/388_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/389_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/389_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/390_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/390_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/391_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/391_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/392_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/392_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/393_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/393_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/394_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/394_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/395_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/395_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/396_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/396_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/397_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/397_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/398_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/398_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/399_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/399_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/400_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/400_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/401_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/401_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/402_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/402_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/403_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/403_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/404_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/404_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/405_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/405_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/406_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/406_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/407_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/407_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/408_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/408_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/409_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/409_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/410_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/410_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/411_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/411_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/412_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/412_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/413_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/413_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/414_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/414_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/415_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/415_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/416_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/416_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/417_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/417_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/418_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/418_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/419_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/419_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/420_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/420_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/421_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/421_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/422_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/422_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/423_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/423_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/424_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/424_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/425_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/425_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/426_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/426_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/427_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/427_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/428_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/428_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/429_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/429_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/430_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/430_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/431_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/431_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/432_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/432_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/433_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/433_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/434_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/434_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/435_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/435_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/436_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/436_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/437_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/437_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/438_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/438_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/439_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/439_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/440_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/440_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/441_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/441_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/442_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/442_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/443_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/443_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/444_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/444_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/445_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/445_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/446_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/446_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/447_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/447_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/448_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/448_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/449_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/449_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/450_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/450_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/451_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/451_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/452_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/452_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/453_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/453_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/454_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/454_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/455_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/455_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/456_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/457_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/457_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/458_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/459_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/459_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/460_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/460_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/461_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/461_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/462_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/462_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/463_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/463_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/464_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/464_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/465_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/465_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/466_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/466_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/467_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/467_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/468_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/468_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/469_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/469_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/470_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/470_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/471_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/471_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/472_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/472_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/473_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/473_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/474_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/474_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/475_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/475_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/476_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/476_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/477_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/477_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/478_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/479_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/479_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/480_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/480_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/481_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/481_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/482_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/482_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/483_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/483_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/484_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/484_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/485_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/486_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/486_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/004_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/013_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/026_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/037_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/051_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/055_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/085_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/086_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/088_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/096_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/097_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/102_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/128_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/218_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/268_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/268_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/269_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/269_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/270_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/270_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/271_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/271_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/272_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/272_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/273_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/273_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/274_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/274_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/275_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/275_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/276_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/276_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/277_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/277_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/278_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/278_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/279_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/279_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/280_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/280_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/281_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/281_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/282_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/282_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/283_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/283_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/284_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/284_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/285_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/285_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/286_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/286_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/287_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/287_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/288_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/288_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/289_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/289_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/290_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/290_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/291_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/292_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/292_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/293_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/294_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/294_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/295_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/295_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/296_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/296_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/297_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/297_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/298_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/298_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/299_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/299_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/300_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/300_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/301_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/301_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/302_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/302_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/303_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/303_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/304_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/304_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/305_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/305_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/306_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/306_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/307_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/307_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/308_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/309_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/309_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/310_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/310_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/311_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/311_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/312_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/312_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/313_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/313_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/314_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/314_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/315_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/315_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/316_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/316_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/317_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/318_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/318_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/319_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/319_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/320_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/320_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/321_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/322_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/322_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/323_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/323_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/324_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/324_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/325_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/325_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/326_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/326_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/327_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/327_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/328_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/328_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/329_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/329_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/330_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/330_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/331_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/331_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/332_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/332_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/333_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/333_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/334_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/334_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/335_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/335_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/336_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/336_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/337_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/337_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/338_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/338_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/339_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/339_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/340_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/340_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/341_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/341_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/342_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/342_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/343_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/343_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/344_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/344_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/345_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/345_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/346_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/346_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/347_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/347_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/348_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/348_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/349_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/349_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/350_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/350_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/351_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/351_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/352_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/352_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/353_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/354_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/354_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/355_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/355_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/356_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/356_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/357_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/357_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/358_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/358_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/359_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/359_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/360_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/360_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/361_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/361_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/362_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/362_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/363_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/363_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/364_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/364_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/365_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/365_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/366_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/366_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/367_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/367_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/368_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/368_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/369_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/369_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/370_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/370_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/371_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/371_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/372_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/372_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/373_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/373_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/374_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/374_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/375_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/375_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/376_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/376_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/377_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/377_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/378_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/378_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/379_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/379_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/380_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/380_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/381_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/381_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/382_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/382_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/383_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/383_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/384_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/384_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/385_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/385_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/386_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/386_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/387_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/387_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/388_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/388_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/389_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/389_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/390_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/390_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/391_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/391_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/392_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/392_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/393_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/393_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/394_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/394_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/395_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/395_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/396_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/396_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/397_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/397_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/398_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/398_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/399_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/399_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/400_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/400_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/401_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/401_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/402_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/402_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/403_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/403_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/404_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/404_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/405_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/405_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/406_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/406_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/407_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/407_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/408_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/408_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/409_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/409_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/410_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/410_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/411_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/411_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/412_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/412_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/413_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/413_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/414_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/414_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/415_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/415_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/416_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/416_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/417_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/417_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/418_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/418_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/419_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/419_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/420_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/420_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/421_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/421_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/422_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/422_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/423_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/423_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/424_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/424_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/425_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/425_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/426_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/426_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/427_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/427_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/428_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/428_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/429_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/429_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/430_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/430_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/431_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/431_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/432_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/432_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/433_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/433_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/434_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/434_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/435_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/435_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/436_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/436_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/437_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/437_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/438_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/438_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/439_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/439_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/440_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/440_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/441_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/441_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/442_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/442_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/443_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/443_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/444_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/444_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/445_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/445_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/446_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/446_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/447_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/447_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/448_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/448_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/449_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/450_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/450_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/451_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/451_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/452_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/452_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/453_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/453_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/454_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/454_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/455_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/455_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/456_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/456_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/457_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/457_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/458_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/459_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/459_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/460_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/460_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/461_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/461_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/462_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/462_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/463_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/463_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/464_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/464_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/465_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/466_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/466_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/467_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/467_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/468_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/468_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/469_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/469_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/470_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/470_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/471_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/471_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/472_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/472_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/473_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/473_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/474_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/474_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/475_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/475_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/476_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/476_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/477_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/477_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/478_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/478_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/479_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/479_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/480_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/481_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/481_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/482_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/482_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/483_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/483_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/484_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/484_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/485_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/485_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/486_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/486_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/487_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/487_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/488_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/488_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/489_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/489_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/490_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/490_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/491_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/491_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/492_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/492_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/493_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/493_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/494_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/494_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/495_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/495_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/496_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/496_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/497_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/497_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/498_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/498_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/499_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/499_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/500_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/500_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/501_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/501_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/502_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/502_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/503_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/503_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/504_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/504_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/505_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/505_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/506_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/506_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/507_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/507_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/508_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/508_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/509_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/509_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/510_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/510_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/511_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/511_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/512_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/512_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/513_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/513_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/514_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/514_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/515_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/515_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/516_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/516_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/517_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/517_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/518_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/518_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/519_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/519_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/520_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/520_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/521_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/521_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/522_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/522_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/523_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/523_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/524_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/524_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/525_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/525_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/526_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/527_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/527_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/528_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/528_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/529_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/529_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/530_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/530_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/531_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/531_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/532_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/532_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/533_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/533_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/534_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/534_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/535_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/535_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/536_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/536_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/537_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/537_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/538_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/538_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/539_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/539_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/540_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/540_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/541_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/541_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/004_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/013_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/026_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/037_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/038_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/051_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/055_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/071_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/085_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/086_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/088_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/096_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/097_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/102_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/119_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/128_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/218_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/231_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/241_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/255_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/268_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/268_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/269_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/269_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/270_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/270_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/271_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/271_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/272_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/272_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/273_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/273_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/274_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/274_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/275_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/275_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/276_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/276_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/277_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/277_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/278_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/278_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/279_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/279_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/280_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/280_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/281_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/281_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/282_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/282_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/283_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/283_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/284_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/284_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/285_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/285_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/286_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/286_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/287_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/287_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/288_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/288_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/289_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/289_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/290_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/290_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/291_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/291_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/292_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/292_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/293_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/293_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/294_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/294_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/295_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/295_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/296_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/296_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/297_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/297_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/298_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/298_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/299_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/299_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/300_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/300_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/301_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/301_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/302_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/302_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/303_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/303_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/304_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/304_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/305_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/305_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/306_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/306_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/307_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/307_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/308_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/308_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/309_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/309_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/310_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/310_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/311_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/311_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/312_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/312_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/313_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/313_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/314_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/314_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/315_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/315_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/316_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/316_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/317_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/317_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/318_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/318_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/319_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/319_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/320_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/320_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/321_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/321_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/322_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/322_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/323_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/323_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/324_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/324_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/325_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/325_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/326_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/326_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/327_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/327_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/328_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/328_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/329_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/329_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/330_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/330_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/331_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/331_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/332_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/332_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/333_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/333_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/334_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/334_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/335_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/335_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/336_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/336_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/337_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/337_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/338_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/338_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/339_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/339_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/340_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/341_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/341_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/342_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/342_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/343_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/343_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/344_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/344_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/345_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/345_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/346_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/346_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/347_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/347_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/348_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/348_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/349_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/349_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/350_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/350_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/351_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/351_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/352_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/352_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/353_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/353_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/354_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/354_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/355_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/355_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/356_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/356_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/357_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/357_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/358_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/358_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/359_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/359_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/360_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/360_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/361_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/361_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/362_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/362_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/363_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/363_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/364_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/364_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/365_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/365_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/366_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/366_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/367_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/367_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/368_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/368_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/369_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/369_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/370_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/370_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/371_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/371_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/372_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/372_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/373_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/373_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/374_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/374_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/375_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/375_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/376_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/376_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/377_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/377_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/378_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/378_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/379_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/379_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/380_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/380_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/381_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/381_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/382_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/382_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/383_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/383_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/384_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/384_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/385_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/385_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/386_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/386_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/387_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/387_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/388_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/388_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/389_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/389_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/390_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/390_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/391_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/391_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/392_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/392_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/393_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/393_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/394_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/394_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/395_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/395_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/396_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/396_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/397_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/397_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/398_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/398_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/399_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/399_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/400_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/400_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/401_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/401_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/402_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/402_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/403_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/403_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/404_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/404_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/405_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/405_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/406_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/406_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/407_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/407_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/408_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/408_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/409_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/409_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/410_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/410_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/411_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/411_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/412_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/412_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/413_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/413_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/414_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/414_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/415_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/415_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/416_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/416_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/417_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/417_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/418_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/418_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/419_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/419_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/420_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/420_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/421_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/421_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/422_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/422_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/423_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/423_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/424_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/424_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/425_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/425_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/426_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/426_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/427_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/427_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/428_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/428_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/429_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/429_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/430_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/430_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/431_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/431_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/432_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/432_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/433_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/433_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/434_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/434_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/435_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/435_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/436_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/436_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/437_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/437_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/438_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/438_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/439_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/439_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/440_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/440_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/441_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/441_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/442_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/442_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/443_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/443_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/444_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/444_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/445_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/445_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/446_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/446_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/447_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/447_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/448_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/448_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/449_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/449_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/450_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/450_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/451_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/451_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/452_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/452_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/453_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/453_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/454_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/454_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/455_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/455_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/456_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/456_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/457_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/457_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/458_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/458_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/459_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/459_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/460_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/460_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/461_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/461_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/462_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/462_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/463_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/463_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/464_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/464_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/465_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/465_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/466_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/466_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/467_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/467_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/468_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/468_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/469_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/469_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/470_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/470_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/471_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/471_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/472_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/472_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/473_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/473_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/474_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/474_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/475_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/475_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/476_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/476_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/477_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/477_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/478_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/478_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/479_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/479_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/480_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/480_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/481_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/481_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/482_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/482_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/483_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/483_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/484_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/484_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/485_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/485_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/486_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/486_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/487_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/487_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/488_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/488_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/489_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/489_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/490_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/490_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/491_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/491_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/492_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/492_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/493_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/493_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/494_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/494_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/495_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/495_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/496_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/496_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/497_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/497_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/498_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/498_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/499_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/499_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/500_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/500_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/501_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/501_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/502_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/502_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/503_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/503_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/504_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/504_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/505_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/505_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/506_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/506_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/507_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/507_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/508_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/508_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/509_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/509_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/510_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/510_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/511_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/511_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/512_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/512_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/513_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/513_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/514_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/514_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/515_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/515_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/516_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/516_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/517_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/517_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/518_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/518_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/519_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/519_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/520_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/520_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/521_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/521_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/522_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/522_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/523_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/523_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/524_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/524_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/525_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/525_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/526_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/526_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/00_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/00_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/00_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/01_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/01_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/01_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/02_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/02_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/02_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/03_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/03_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/03_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/04_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/04_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/04_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/05_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/05_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/05_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/06_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/06_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/06_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/07_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/07_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/08_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/08_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/08_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/09_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/09_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/09_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/10_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/10_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/10_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/11_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/11_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/11_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/12_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/12_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/12_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/13_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/13_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/14_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/14_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/14_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/15_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/15_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/16_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/16_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/16_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/17_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/17_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/18_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/18_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/18_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/19_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/19_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/19_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/20_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/20_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/20_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/21_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/21_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/22_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/22_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/22_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/23_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/23_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/24_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/24_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/24_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/25_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/25_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/25_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/26_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/26_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/27_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/27_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/27_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/28_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/28_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/29_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/29_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/29_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/30_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/30_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/30_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/31_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/31_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/31_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/32_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/32_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/32_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/33_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/33_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/33_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/34_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/34_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/34_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/35_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/35_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/35_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/36_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/36_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/36_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/37_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/37_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/37_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/38_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/38_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/39_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/39_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/39_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/40_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/40_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/40_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/41_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/41_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/41_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/004_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/004_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/010_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/013_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/015_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/016_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/019_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/021_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/026_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/026_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/030_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/033_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/036_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/037_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/038_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/038_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/043_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/044_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/054_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/055_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/062_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/065_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/067_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/071_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/071_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/073_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/085_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/085_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/086_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/089_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/097_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/102_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/102_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/104_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/109_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/112_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/118_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/119_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/120_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/128_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/130_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/131_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/137_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/143_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/157_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/159_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/164_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/171_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/178_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/186_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/190_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/198_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/200_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/205_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/208_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/214_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/216_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/230_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/231_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/235_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/241_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/247_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/250_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/251_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/255_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/255_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/257_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/262_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/263_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/267_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00004.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/1_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/1_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/1_00002.npy diff --git a/configs/v1-mix-medium-mitch-ish.yaml b/configs/v1-mix-medium-mitch-ish.yaml deleted file mode 100644 index 442d1ec2c..000000000 --- a/configs/v1-mix-medium-mitch-ish.yaml +++ /dev/null @@ -1,170 +0,0 @@ -run_name: v1-mix-medium-mitch-ish -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: v1-mix - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null # causes instability on AMD GPUs - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null # getting errors on LUMI right now -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 476837 # 2T tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 -fsdp: - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -data: - paths: ${path.glob:${oc.env:DATA_PATH}/v1-sample/gpt-neox-20b-pii-special/books/*.npy,${oc.env:DATA_PATH}/v1-sample/gpt-neox-20b-pii-special/c4/*.npy,${oc.env:DATA_PATH}/v1-sample/gpt-neox-20b-pii-special/common-crawl/*/*.npy,${oc.env:DATA_PATH}/v1-sample/gpt-neox-20b-pii-special/s2/*.npy,${oc.env:DATA_PATH}/v1-sample/gpt-neox-20b-pii-special/stack/*.npy,${oc.env:DATA_PATH}/v1-sample/gpt-neox-20b-pii-special/wiki/*.npy} - pad_direction: right - num_workers: 0 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/v1-mix-medium-s3.yaml b/configs/v1-mix-medium-s3.yaml deleted file mode 100644 index 0bed9bfc8..000000000 --- a/configs/v1-mix-medium-s3.yaml +++ /dev/null @@ -1,4443 +0,0 @@ -run_name: v1-mix-medium-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: v1-mix - -model: - d_model: 4096 - n_heads: 16 - n_layers: 29 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - flash_attention: false # can't be used with ALiBi - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: normal - -compile: - fullgraph: false - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - t_warmup: 5000 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/7b/${run_name} -save_overwrite: true -time_limit: null -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 476837 # 2T tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block - precision: pure - -max_grad_norm: 1.0 -max_grad_norm_ratio: 1.5 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - # TODO: do we care about c4 and RP validation? We don't have these tokenized at the moment. - # - label: c4-validation - # subset_num_batches: 10 - # data: - # paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - # num_workers: 2 - # drop_last: true - # pin_memory: true - # persistent_workers: true - # prefetch_factor: 4 - - # - label: rp-validation - # subset_num_batches: 10 - # data: - # paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - # num_workers: 2 - # drop_last: true - # pin_memory: true - # persistent_workers: true - # prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/1_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/1_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/68_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/038_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/071_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/096_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/119_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/231_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/241_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/255_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/268_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/268_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/269_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/269_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/270_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/270_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/271_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/271_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/272_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/272_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/273_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/273_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/274_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/274_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/275_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/276_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/276_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/277_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/277_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/278_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/278_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/279_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/279_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/280_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/280_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/281_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/281_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/282_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/282_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/283_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/283_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/284_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/284_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/285_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/285_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/286_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/286_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/287_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/287_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/288_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/288_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/289_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/289_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/290_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/290_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/291_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/291_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/292_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/292_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/293_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/293_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/294_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/294_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/295_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/295_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/296_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/296_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/297_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/297_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/298_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/298_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/299_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/299_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/300_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/300_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/301_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/301_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/302_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/302_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/303_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/303_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/304_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/304_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/305_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/305_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/306_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/306_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/307_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/307_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/308_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/308_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/309_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/309_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/310_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/310_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/311_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/311_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/312_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/312_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/313_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/313_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/314_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/314_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/315_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/315_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/316_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/316_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/317_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/317_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/318_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/318_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/319_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/319_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/320_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/320_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/321_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/321_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/322_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/322_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/323_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/323_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/324_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/324_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/325_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/325_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/326_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/326_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/327_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/327_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/328_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/328_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/329_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/329_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/330_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/330_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/331_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/331_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/332_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/332_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/333_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/333_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/334_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/334_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/335_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/335_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/336_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/336_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/337_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/337_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/338_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/338_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/339_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/339_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/340_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/340_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/341_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/341_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/342_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/342_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/343_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/343_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/344_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/344_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/345_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/345_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/346_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/346_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/347_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/347_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/348_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/348_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/349_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/349_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/350_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/350_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/351_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/351_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/352_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/352_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/353_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/353_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/354_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/354_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/355_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/355_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/356_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/356_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/357_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/357_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/358_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/358_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/359_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/359_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/360_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/360_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/361_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/361_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/362_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/362_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/363_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/363_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/364_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/364_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/365_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/365_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/366_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/366_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/367_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/367_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/368_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/368_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/369_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/369_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/370_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/370_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/371_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/371_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/372_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/372_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/373_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/373_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/374_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/374_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/375_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/375_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/376_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/376_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/377_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/377_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/378_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/378_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/379_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/379_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/380_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/380_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/381_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/381_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/382_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/382_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/383_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/383_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/384_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/384_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/385_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/385_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/386_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/386_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/387_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/387_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/388_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/388_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/389_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/389_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/390_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/390_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/391_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/391_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/392_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/392_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/393_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/393_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/394_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/394_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/395_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/395_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/396_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/396_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/397_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/397_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/398_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/398_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/399_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/399_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/400_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/400_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/401_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/401_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/402_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/402_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/403_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/403_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/404_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/404_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/405_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/405_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/406_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/406_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/407_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/407_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/408_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/408_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/409_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/409_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/410_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/410_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/411_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/411_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/412_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/412_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/413_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/413_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/414_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/414_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/415_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/415_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/416_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/416_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/417_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/417_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/418_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/418_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/419_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/419_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/420_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/420_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/421_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/421_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/422_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/422_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/423_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/423_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/424_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/424_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/425_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/425_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/426_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/426_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/427_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/427_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/428_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/428_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/429_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/429_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/430_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/430_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/431_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/431_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/432_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/432_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/433_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/433_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/434_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/434_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/435_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/435_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/436_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/436_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/437_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/437_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/438_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/438_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/439_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/439_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/440_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/440_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/441_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/441_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/442_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/442_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/443_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/443_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/444_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/444_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/445_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/445_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/446_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/446_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/447_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/447_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/448_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/448_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/449_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/449_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/450_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/450_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/451_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/451_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/452_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/452_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/453_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/453_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/454_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/454_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/455_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/455_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/456_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/457_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/457_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/458_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/459_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/459_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/460_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/460_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/461_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/461_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/462_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/462_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/463_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/463_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/464_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/464_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/465_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/465_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/466_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/466_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/467_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/467_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/468_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/468_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/469_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/469_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/470_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/470_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/471_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/471_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/472_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/472_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/473_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/473_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/474_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/474_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/475_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/475_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/476_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/476_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/477_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/477_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/478_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/479_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/479_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/480_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/480_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/481_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/481_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/482_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/482_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/483_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/483_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/484_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/484_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/485_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/486_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/486_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/004_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/013_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/026_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/037_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/051_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/055_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/085_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/086_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/088_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/096_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/097_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/102_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/128_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/218_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/268_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/268_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/269_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/269_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/270_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/270_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/271_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/271_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/272_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/272_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/273_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/273_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/274_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/274_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/275_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/275_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/276_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/276_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/277_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/277_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/278_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/278_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/279_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/279_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/280_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/280_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/281_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/281_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/282_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/282_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/283_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/283_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/284_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/284_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/285_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/285_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/286_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/286_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/287_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/287_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/288_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/288_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/289_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/289_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/290_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/290_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/291_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/292_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/292_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/293_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/294_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/294_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/295_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/295_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/296_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/296_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/297_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/297_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/298_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/298_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/299_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/299_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/300_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/300_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/301_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/301_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/302_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/302_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/303_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/303_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/304_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/304_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/305_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/305_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/306_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/306_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/307_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/307_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/308_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/309_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/309_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/310_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/310_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/311_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/311_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/312_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/312_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/313_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/313_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/314_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/314_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/315_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/315_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/316_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/316_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/317_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/318_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/318_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/319_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/319_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/320_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/320_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/321_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/322_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/322_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/323_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/323_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/324_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/324_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/325_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/325_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/326_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/326_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/327_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/327_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/328_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/328_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/329_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/329_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/330_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/330_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/331_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/331_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/332_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/332_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/333_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/333_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/334_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/334_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/335_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/335_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/336_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/336_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/337_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/337_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/338_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/338_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/339_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/339_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/340_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/340_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/341_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/341_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/342_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/342_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/343_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/343_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/344_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/344_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/345_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/345_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/346_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/346_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/347_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/347_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/348_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/348_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/349_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/349_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/350_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/350_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/351_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/351_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/352_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/352_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/353_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/354_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/354_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/355_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/355_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/356_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/356_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/357_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/357_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/358_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/358_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/359_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/359_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/360_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/360_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/361_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/361_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/362_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/362_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/363_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/363_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/364_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/364_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/365_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/365_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/366_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/366_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/367_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/367_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/368_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/368_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/369_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/369_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/370_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/370_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/371_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/371_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/372_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/372_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/373_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/373_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/374_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/374_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/375_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/375_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/376_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/376_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/377_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/377_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/378_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/378_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/379_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/379_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/380_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/380_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/381_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/381_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/382_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/382_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/383_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/383_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/384_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/384_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/385_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/385_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/386_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/386_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/387_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/387_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/388_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/388_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/389_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/389_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/390_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/390_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/391_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/391_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/392_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/392_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/393_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/393_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/394_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/394_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/395_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/395_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/396_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/396_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/397_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/397_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/398_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/398_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/399_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/399_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/400_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/400_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/401_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/401_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/402_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/402_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/403_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/403_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/404_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/404_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/405_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/405_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/406_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/406_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/407_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/407_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/408_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/408_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/409_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/409_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/410_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/410_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/411_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/411_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/412_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/412_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/413_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/413_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/414_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/414_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/415_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/415_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/416_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/416_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/417_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/417_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/418_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/418_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/419_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/419_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/420_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/420_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/421_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/421_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/422_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/422_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/423_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/423_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/424_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/424_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/425_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/425_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/426_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/426_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/427_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/427_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/428_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/428_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/429_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/429_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/430_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/430_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/431_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/431_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/432_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/432_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/433_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/433_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/434_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/434_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/435_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/435_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/436_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/436_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/437_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/437_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/438_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/438_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/439_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/439_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/440_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/440_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/441_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/441_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/442_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/442_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/443_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/443_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/444_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/444_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/445_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/445_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/446_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/446_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/447_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/447_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/448_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/448_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/449_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/450_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/450_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/451_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/451_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/452_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/452_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/453_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/453_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/454_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/454_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/455_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/455_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/456_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/456_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/457_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/457_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/458_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/459_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/459_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/460_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/460_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/461_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/461_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/462_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/462_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/463_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/463_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/464_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/464_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/465_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/466_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/466_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/467_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/467_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/468_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/468_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/469_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/469_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/470_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/470_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/471_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/471_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/472_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/472_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/473_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/473_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/474_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/474_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/475_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/475_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/476_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/476_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/477_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/477_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/478_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/478_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/479_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/479_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/480_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/481_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/481_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/482_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/482_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/483_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/483_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/484_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/484_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/485_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/485_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/486_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/486_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/487_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/487_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/488_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/488_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/489_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/489_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/490_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/490_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/491_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/491_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/492_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/492_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/493_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/493_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/494_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/494_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/495_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/495_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/496_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/496_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/497_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/497_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/498_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/498_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/499_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/499_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/500_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/500_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/501_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/501_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/502_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/502_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/503_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/503_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/504_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/504_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/505_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/505_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/506_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/506_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/507_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/507_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/508_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/508_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/509_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/509_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/510_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/510_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/511_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/511_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/512_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/512_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/513_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/513_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/514_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/514_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/515_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/515_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/516_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/516_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/517_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/517_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/518_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/518_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/519_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/519_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/520_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/520_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/521_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/521_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/522_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/522_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/523_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/523_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/524_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/524_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/525_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/525_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/526_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/527_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/527_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/528_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/528_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/529_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/529_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/530_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/530_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/531_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/531_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/532_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/532_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/533_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/533_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/534_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/534_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/535_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/535_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/536_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/536_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/537_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/537_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/538_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/538_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/539_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/539_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/540_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/540_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/541_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/541_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/004_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/013_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/026_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/037_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/038_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/051_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/055_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/071_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/085_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/086_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/088_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/096_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/097_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/102_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/119_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/128_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/218_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/231_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/241_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/255_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/268_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/268_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/269_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/269_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/270_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/270_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/271_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/271_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/272_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/272_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/273_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/273_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/274_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/274_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/275_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/275_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/276_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/276_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/277_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/277_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/278_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/278_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/279_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/279_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/280_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/280_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/281_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/281_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/282_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/282_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/283_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/283_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/284_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/284_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/285_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/285_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/286_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/286_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/287_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/287_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/288_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/288_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/289_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/289_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/290_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/290_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/291_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/291_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/292_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/292_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/293_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/293_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/294_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/294_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/295_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/295_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/296_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/296_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/297_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/297_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/298_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/298_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/299_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/299_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/300_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/300_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/301_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/301_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/302_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/302_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/303_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/303_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/304_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/304_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/305_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/305_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/306_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/306_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/307_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/307_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/308_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/308_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/309_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/309_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/310_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/310_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/311_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/311_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/312_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/312_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/313_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/313_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/314_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/314_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/315_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/315_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/316_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/316_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/317_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/317_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/318_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/318_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/319_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/319_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/320_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/320_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/321_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/321_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/322_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/322_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/323_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/323_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/324_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/324_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/325_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/325_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/326_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/326_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/327_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/327_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/328_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/328_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/329_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/329_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/330_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/330_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/331_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/331_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/332_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/332_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/333_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/333_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/334_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/334_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/335_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/335_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/336_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/336_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/337_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/337_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/338_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/338_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/339_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/339_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/340_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/341_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/341_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/342_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/342_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/343_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/343_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/344_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/344_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/345_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/345_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/346_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/346_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/347_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/347_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/348_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/348_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/349_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/349_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/350_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/350_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/351_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/351_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/352_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/352_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/353_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/353_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/354_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/354_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/355_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/355_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/356_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/356_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/357_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/357_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/358_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/358_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/359_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/359_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/360_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/360_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/361_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/361_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/362_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/362_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/363_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/363_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/364_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/364_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/365_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/365_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/366_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/366_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/367_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/367_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/368_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/368_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/369_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/369_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/370_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/370_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/371_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/371_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/372_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/372_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/373_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/373_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/374_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/374_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/375_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/375_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/376_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/376_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/377_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/377_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/378_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/378_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/379_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/379_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/380_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/380_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/381_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/381_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/382_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/382_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/383_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/383_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/384_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/384_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/385_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/385_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/386_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/386_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/387_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/387_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/388_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/388_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/389_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/389_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/390_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/390_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/391_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/391_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/392_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/392_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/393_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/393_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/394_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/394_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/395_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/395_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/396_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/396_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/397_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/397_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/398_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/398_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/399_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/399_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/400_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/400_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/401_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/401_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/402_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/402_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/403_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/403_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/404_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/404_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/405_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/405_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/406_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/406_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/407_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/407_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/408_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/408_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/409_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/409_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/410_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/410_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/411_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/411_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/412_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/412_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/413_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/413_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/414_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/414_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/415_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/415_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/416_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/416_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/417_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/417_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/418_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/418_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/419_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/419_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/420_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/420_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/421_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/421_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/422_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/422_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/423_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/423_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/424_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/424_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/425_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/425_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/426_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/426_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/427_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/427_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/428_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/428_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/429_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/429_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/430_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/430_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/431_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/431_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/432_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/432_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/433_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/433_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/434_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/434_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/435_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/435_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/436_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/436_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/437_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/437_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/438_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/438_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/439_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/439_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/440_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/440_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/441_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/441_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/442_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/442_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/443_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/443_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/444_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/444_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/445_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/445_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/446_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/446_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/447_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/447_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/448_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/448_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/449_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/449_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/450_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/450_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/451_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/451_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/452_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/452_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/453_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/453_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/454_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/454_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/455_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/455_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/456_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/456_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/457_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/457_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/458_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/458_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/459_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/459_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/460_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/460_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/461_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/461_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/462_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/462_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/463_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/463_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/464_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/464_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/465_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/465_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/466_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/466_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/467_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/467_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/468_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/468_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/469_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/469_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/470_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/470_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/471_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/471_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/472_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/472_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/473_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/473_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/474_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/474_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/475_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/475_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/476_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/476_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/477_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/477_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/478_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/478_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/479_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/479_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/480_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/480_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/481_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/481_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/482_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/482_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/483_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/483_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/484_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/484_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/485_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/485_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/486_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/486_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/487_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/487_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/488_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/488_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/489_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/489_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/490_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/490_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/491_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/491_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/492_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/492_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/493_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/493_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/494_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/494_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/495_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/495_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/496_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/496_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/497_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/497_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/498_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/498_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/499_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/499_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/500_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/500_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/501_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/501_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/502_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/502_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/503_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/503_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/504_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/504_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/505_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/505_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/506_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/506_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/507_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/507_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/508_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/508_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/509_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/509_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/510_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/510_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/511_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/511_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/512_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/512_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/513_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/513_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/514_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/514_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/515_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/515_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/516_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/516_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/517_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/517_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/518_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/518_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/519_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/519_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/520_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/520_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/521_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/521_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/522_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/522_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/523_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/523_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/524_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/524_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/525_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/525_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/526_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/526_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/00_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/00_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/00_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/01_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/01_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/01_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/02_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/02_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/02_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/03_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/03_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/03_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/04_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/04_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/04_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/05_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/05_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/05_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/06_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/06_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/06_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/07_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/07_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/08_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/08_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/08_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/09_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/09_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/09_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/10_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/10_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/10_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/11_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/11_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/11_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/12_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/12_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/12_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/13_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/13_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/14_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/14_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/14_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/15_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/15_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/16_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/16_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/16_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/17_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/17_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/18_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/18_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/18_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/19_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/19_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/19_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/20_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/20_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/20_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/21_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/21_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/22_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/22_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/22_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/23_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/23_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/24_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/24_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/24_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/25_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/25_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/25_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/26_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/26_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/27_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/27_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/27_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/28_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/28_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/29_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/29_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/29_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/30_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/30_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/30_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/31_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/31_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/31_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/32_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/32_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/32_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/33_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/33_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/33_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/34_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/34_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/34_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/35_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/35_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/35_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/36_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/36_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/36_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/37_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/37_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/37_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/38_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/38_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/39_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/39_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/39_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/40_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/40_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/40_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/41_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/41_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/41_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/004_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/004_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/010_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/013_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/015_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/016_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/019_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/021_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/026_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/026_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/030_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/033_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/036_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/037_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/038_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/038_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/043_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/044_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/054_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/055_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/062_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/065_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/067_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/071_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/071_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/073_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/085_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/085_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/086_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/089_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/097_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/102_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/102_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/104_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/109_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/112_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/118_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/119_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/120_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/128_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/130_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/131_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/137_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/143_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/157_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/159_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/164_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/171_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/178_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/186_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/190_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/198_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/200_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/205_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/208_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/214_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/216_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/230_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/231_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/235_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/241_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/247_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/250_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/251_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/255_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/255_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/257_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/262_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/263_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/267_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00004.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/1_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/1_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/1_00002.npy diff --git a/configs/v1-mix-medium.yaml b/configs/v1-mix-medium.yaml deleted file mode 100644 index 01bc60441..000000000 --- a/configs/v1-mix-medium.yaml +++ /dev/null @@ -1,165 +0,0 @@ -run_name: v1-mix-medium-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: v1-mix - -model: - d_model: 4096 - n_heads: 16 - n_layers: 30 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - include_bias: false - block_type: sequential - layer_norm_type: amd_compatible - layer_norm_with_affine: false - bias_for_layer_norm: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: normal - -compile: null # causes instability on AMD GPUs - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - t_warmup: 5000 - alpha_f: 0.1 - -data: - paths: ${path.glob:${oc.env:DATA_PATH}/v1-sample/gpt-neox-20b-pii-special/books/*.npy,${oc.env:DATA_PATH}/v1-sample/gpt-neox-20b-pii-special/c4/*.npy,${oc.env:DATA_PATH}/v1-sample/gpt-neox-20b-pii-special/common-crawl/*/*.npy,${oc.env:DATA_PATH}/v1-sample/gpt-neox-20b-pii-special/s2/*.npy,${oc.env:DATA_PATH}/v1-sample/gpt-neox-20b-pii-special/stack/*.npy,${oc.env:DATA_PATH}/v1-sample/gpt-neox-20b-pii-special/wiki/*.npy} - pad_direction: right - num_workers: 0 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null # getting errors on LUMI right now -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 476837 # 2T tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -max_grad_norm: 1.0 -max_grad_norm_ratio: 1.5 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/v1-mix-small-s3.yaml b/configs/v1-mix-small-s3.yaml deleted file mode 100644 index 375526890..000000000 --- a/configs/v1-mix-small-s3.yaml +++ /dev/null @@ -1,4435 +0,0 @@ -run_name: olmo-small-adamw-mitch-init -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-small - group: v1-mix - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - include_bias: false - block_type: sequential - layer_norm_type: low_precision - layer_norm_with_affine: true # workaround for the layer norm bug - bias_for_layer_norm: true # workaround for the layer norm bug - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: normal - -compile: null # causes instability on AMD GPUs - -optimizer: - name: adamw - learning_rate: 1.0e-3 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 5000 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/1b/${run_name} -save_overwrite: true -time_limit: null -# Sharded checkpoints (best for restarts) -save_interval: 100 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null # getting errors on LUMI right now -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 476837 # 2T tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 8 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - # TODO: do we care about c4 and RP validation? We don't have these tokenized at the moment. - # - label: c4-validation - # subset_num_batches: 10 - # data: - # paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - # num_workers: 2 - # drop_last: true - # pin_memory: true - # persistent_workers: true - # prefetch_factor: 4 - - # - label: rp-validation - # subset_num_batches: 10 - # data: - # paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - # num_workers: 2 - # drop_last: true - # pin_memory: true - # persistent_workers: true - # prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/0_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/1_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/1_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/books/2_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/00_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/01_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/02_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/03_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/04_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/05_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/06_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/07_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/08_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/09_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/10_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/11_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/12_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/13_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/14_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/15_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/16_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/17_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/18_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/19_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/20_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/21_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/22_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/23_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/24_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/25_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/26_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/27_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/28_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/29_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/30_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/31_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/32_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/33_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/34_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/35_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/36_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/37_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/38_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/39_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/40_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/41_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/42_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/43_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/44_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/45_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/46_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/47_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/48_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/49_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/50_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/51_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/52_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/53_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/54_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/55_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/56_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/57_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/58_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/59_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/60_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/61_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/62_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/63_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/64_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/65_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/66_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/67_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/68_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/69_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/70_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/71_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/72_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/73_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/74_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/75_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/76_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/77_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/78_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/79_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/80_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/81_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/82_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/83_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/84_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/c4/85_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/038_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/071_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/096_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/119_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/231_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/241_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/255_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/268_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/268_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/269_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/269_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/270_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/270_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/271_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/271_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/272_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/272_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/273_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/273_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/274_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/274_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/275_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/276_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/276_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/277_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/277_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/278_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/278_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/279_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/279_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/280_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/280_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/281_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/281_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/282_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/282_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/283_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/283_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/284_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/284_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/285_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/285_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/286_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/286_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/287_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/287_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/288_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/288_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/289_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/289_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/290_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/290_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/291_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/291_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/292_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/292_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/293_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/293_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/294_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/294_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/295_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/295_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/296_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/296_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/297_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/297_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/298_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/298_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/299_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/299_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/300_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/300_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/301_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/301_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/302_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/302_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/303_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/303_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/304_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/304_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/305_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/305_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/306_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/306_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/307_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/307_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/308_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/308_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/309_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/309_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/310_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/310_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/311_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/311_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/312_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/312_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/313_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/313_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/314_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/314_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/315_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/315_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/316_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/316_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/317_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/317_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/318_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/318_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/319_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/319_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/320_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/320_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/321_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/321_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/322_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/322_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/323_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/323_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/324_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/324_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/325_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/325_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/326_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/326_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/327_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/327_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/328_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/328_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/329_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/329_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/330_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/330_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/331_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/331_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/332_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/332_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/333_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/333_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/334_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/334_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/335_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/335_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/336_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/336_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/337_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/337_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/338_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/338_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/339_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/339_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/340_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/340_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/341_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/341_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/342_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/342_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/343_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/343_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/344_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/344_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/345_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/345_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/346_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/346_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/347_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/347_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/348_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/348_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/349_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/349_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/350_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/350_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/351_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/351_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/352_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/352_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/353_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/353_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/354_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/354_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/355_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/355_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/356_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/356_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/357_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/357_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/358_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/358_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/359_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/359_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/360_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/360_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/361_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/361_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/362_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/362_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/363_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/363_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/364_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/364_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/365_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/365_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/366_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/366_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/367_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/367_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/368_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/368_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/369_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/369_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/370_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/370_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/371_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/371_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/372_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/372_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/373_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/373_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/374_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/374_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/375_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/375_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/376_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/376_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/377_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/377_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/378_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/378_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/379_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/379_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/380_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/380_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/381_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/381_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/382_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/382_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/383_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/383_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/384_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/384_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/385_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/385_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/386_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/386_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/387_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/387_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/388_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/388_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/389_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/389_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/390_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/390_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/391_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/391_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/392_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/392_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/393_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/393_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/394_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/394_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/395_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/395_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/396_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/396_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/397_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/397_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/398_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/398_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/399_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/399_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/400_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/400_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/401_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/401_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/402_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/402_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/403_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/403_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/404_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/404_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/405_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/405_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/406_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/406_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/407_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/407_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/408_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/408_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/409_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/409_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/410_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/410_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/411_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/411_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/412_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/412_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/413_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/413_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/414_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/414_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/415_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/415_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/416_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/416_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/417_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/417_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/418_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/418_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/419_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/419_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/420_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/420_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/421_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/421_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/422_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/422_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/423_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/423_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/424_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/424_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/425_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/425_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/426_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/426_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/427_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/427_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/428_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/428_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/429_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/429_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/430_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/430_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/431_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/431_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/432_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/432_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/433_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/433_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/434_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/434_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/435_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/435_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/436_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/436_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/437_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/437_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/438_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/438_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/439_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/439_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/440_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/440_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/441_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/441_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/442_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/442_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/443_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/443_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/444_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/444_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/445_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/445_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/446_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/446_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/447_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/447_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/448_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/448_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/449_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/449_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/450_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/450_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/451_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/451_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/452_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/452_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/453_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/453_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/454_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/454_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/455_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/455_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/456_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/457_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/457_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/458_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/459_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/459_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/460_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/460_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/461_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/461_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/462_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/462_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/463_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/463_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/464_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/464_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/465_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/465_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/466_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/466_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/467_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/467_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/468_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/468_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/469_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/469_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/470_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/470_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/471_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/471_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/472_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/472_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/473_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/473_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/474_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/474_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/475_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/475_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/476_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/476_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/477_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/477_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/478_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/479_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/479_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/480_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/480_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/481_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/481_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/482_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/482_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/483_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/483_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/484_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/484_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/485_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/486_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head/486_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/004_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/013_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/026_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/037_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/051_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/055_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/085_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/086_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/088_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/096_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/097_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/102_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/128_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/218_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/268_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/268_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/269_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/269_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/270_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/270_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/271_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/271_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/272_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/272_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/273_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/273_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/274_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/274_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/275_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/275_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/276_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/276_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/277_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/277_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/278_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/278_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/279_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/279_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/280_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/280_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/281_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/281_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/282_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/282_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/283_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/283_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/284_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/284_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/285_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/285_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/286_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/286_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/287_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/287_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/288_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/288_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/289_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/289_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/290_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/290_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/291_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/292_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/292_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/293_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/294_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/294_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/295_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/295_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/296_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/296_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/297_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/297_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/298_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/298_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/299_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/299_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/300_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/300_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/301_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/301_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/302_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/302_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/303_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/303_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/304_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/304_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/305_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/305_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/306_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/306_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/307_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/307_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/308_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/309_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/309_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/310_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/310_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/311_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/311_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/312_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/312_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/313_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/313_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/314_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/314_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/315_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/315_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/316_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/316_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/317_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/318_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/318_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/319_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/319_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/320_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/320_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/321_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/322_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/322_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/323_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/323_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/324_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/324_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/325_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/325_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/326_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/326_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/327_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/327_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/328_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/328_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/329_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/329_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/330_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/330_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/331_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/331_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/332_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/332_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/333_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/333_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/334_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/334_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/335_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/335_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/336_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/336_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/337_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/337_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/338_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/338_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/339_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/339_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/340_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/340_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/341_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/341_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/342_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/342_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/343_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/343_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/344_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/344_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/345_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/345_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/346_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/346_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/347_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/347_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/348_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/348_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/349_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/349_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/350_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/350_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/351_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/351_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/352_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/352_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/353_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/354_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/354_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/355_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/355_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/356_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/356_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/357_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/357_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/358_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/358_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/359_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/359_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/360_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/360_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/361_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/361_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/362_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/362_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/363_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/363_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/364_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/364_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/365_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/365_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/366_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/366_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/367_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/367_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/368_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/368_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/369_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/369_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/370_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/370_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/371_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/371_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/372_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/372_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/373_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/373_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/374_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/374_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/375_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/375_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/376_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/376_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/377_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/377_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/378_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/378_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/379_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/379_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/380_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/380_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/381_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/381_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/382_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/382_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/383_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/383_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/384_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/384_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/385_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/385_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/386_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/386_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/387_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/387_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/388_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/388_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/389_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/389_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/390_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/390_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/391_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/391_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/392_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/392_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/393_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/393_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/394_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/394_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/395_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/395_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/396_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/396_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/397_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/397_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/398_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/398_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/399_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/399_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/400_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/400_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/401_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/401_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/402_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/402_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/403_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/403_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/404_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/404_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/405_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/405_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/406_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/406_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/407_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/407_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/408_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/408_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/409_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/409_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/410_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/410_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/411_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/411_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/412_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/412_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/413_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/413_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/414_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/414_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/415_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/415_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/416_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/416_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/417_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/417_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/418_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/418_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/419_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/419_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/420_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/420_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/421_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/421_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/422_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/422_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/423_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/423_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/424_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/424_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/425_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/425_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/426_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/426_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/427_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/427_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/428_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/428_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/429_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/429_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/430_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/430_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/431_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/431_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/432_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/432_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/433_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/433_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/434_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/434_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/435_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/435_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/436_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/436_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/437_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/437_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/438_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/438_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/439_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/439_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/440_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/440_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/441_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/441_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/442_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/442_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/443_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/443_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/444_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/444_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/445_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/445_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/446_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/446_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/447_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/447_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/448_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/448_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/449_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/450_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/450_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/451_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/451_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/452_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/452_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/453_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/453_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/454_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/454_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/455_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/455_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/456_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/456_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/457_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/457_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/458_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/459_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/459_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/460_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/460_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/461_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/461_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/462_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/462_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/463_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/463_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/464_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/464_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/465_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/466_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/466_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/467_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/467_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/468_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/468_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/469_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/469_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/470_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/470_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/471_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/471_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/472_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/472_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/473_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/473_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/474_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/474_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/475_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/475_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/476_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/476_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/477_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/477_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/478_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/478_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/479_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/479_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/480_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/481_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/481_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/482_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/482_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/483_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/483_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/484_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/484_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/485_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/485_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/486_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/486_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/487_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/487_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/488_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/488_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/489_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/489_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/490_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/490_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/491_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/491_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/492_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/492_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/493_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/493_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/494_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/494_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/495_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/495_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/496_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/496_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/497_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/497_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/498_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/498_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/499_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/499_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/500_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/500_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/501_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/501_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/502_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/502_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/503_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/503_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/504_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/504_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/505_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/505_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/506_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/506_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/507_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/507_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/508_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/508_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/509_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/509_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/510_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/510_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/511_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/511_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/512_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/512_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/513_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/513_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/514_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/514_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/515_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/515_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/516_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/516_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/517_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/517_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/518_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/518_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/519_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/519_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/520_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/520_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/521_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/521_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/522_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/522_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/523_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/523_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/524_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/524_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/525_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/525_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/526_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/527_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/527_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/528_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/528_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/529_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/529_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/530_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/530_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/531_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/531_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/532_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/532_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/533_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/533_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/534_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/534_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/535_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/535_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/536_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/536_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/537_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/537_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/538_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/538_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/539_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/539_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/540_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/540_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/541_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle/541_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/004_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/013_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/026_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/037_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/038_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/051_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/055_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/071_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/085_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/086_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/088_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/096_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/097_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/102_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/119_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/128_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/218_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/231_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/241_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/255_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/268_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/268_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/269_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/269_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/270_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/270_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/271_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/271_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/272_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/272_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/273_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/273_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/274_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/274_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/275_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/275_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/276_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/276_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/277_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/277_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/278_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/278_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/279_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/279_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/280_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/280_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/281_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/281_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/282_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/282_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/283_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/283_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/284_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/284_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/285_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/285_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/286_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/286_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/287_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/287_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/288_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/288_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/289_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/289_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/290_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/290_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/291_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/291_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/292_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/292_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/293_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/293_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/294_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/294_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/295_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/295_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/296_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/296_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/297_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/297_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/298_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/298_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/299_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/299_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/300_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/300_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/301_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/301_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/302_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/302_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/303_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/303_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/304_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/304_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/305_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/305_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/306_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/306_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/307_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/307_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/308_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/308_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/309_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/309_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/310_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/310_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/311_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/311_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/312_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/312_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/313_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/313_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/314_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/314_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/315_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/315_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/316_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/316_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/317_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/317_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/318_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/318_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/319_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/319_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/320_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/320_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/321_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/321_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/322_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/322_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/323_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/323_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/324_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/324_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/325_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/325_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/326_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/326_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/327_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/327_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/328_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/328_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/329_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/329_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/330_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/330_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/331_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/331_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/332_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/332_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/333_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/333_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/334_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/334_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/335_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/335_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/336_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/336_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/337_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/337_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/338_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/338_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/339_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/339_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/340_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/341_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/341_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/342_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/342_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/343_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/343_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/344_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/344_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/345_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/345_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/346_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/346_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/347_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/347_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/348_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/348_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/349_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/349_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/350_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/350_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/351_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/351_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/352_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/352_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/353_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/353_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/354_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/354_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/355_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/355_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/356_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/356_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/357_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/357_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/358_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/358_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/359_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/359_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/360_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/360_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/361_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/361_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/362_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/362_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/363_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/363_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/364_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/364_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/365_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/365_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/366_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/366_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/367_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/367_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/368_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/368_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/369_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/369_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/370_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/370_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/371_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/371_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/372_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/372_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/373_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/373_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/374_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/374_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/375_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/375_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/376_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/376_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/377_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/377_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/378_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/378_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/379_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/379_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/380_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/380_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/381_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/381_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/382_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/382_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/383_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/383_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/384_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/384_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/385_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/385_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/386_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/386_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/387_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/387_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/388_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/388_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/389_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/389_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/390_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/390_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/391_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/391_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/392_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/392_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/393_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/393_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/394_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/394_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/395_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/395_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/396_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/396_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/397_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/397_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/398_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/398_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/399_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/399_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/400_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/400_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/401_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/401_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/402_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/402_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/403_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/403_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/404_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/404_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/405_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/405_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/406_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/406_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/407_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/407_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/408_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/408_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/409_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/409_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/410_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/410_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/411_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/411_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/412_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/412_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/413_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/413_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/414_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/414_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/415_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/415_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/416_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/416_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/417_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/417_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/418_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/418_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/419_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/419_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/420_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/420_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/421_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/421_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/422_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/422_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/423_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/423_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/424_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/424_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/425_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/425_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/426_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/426_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/427_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/427_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/428_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/428_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/429_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/429_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/430_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/430_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/431_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/431_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/432_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/432_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/433_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/433_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/434_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/434_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/435_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/435_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/436_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/436_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/437_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/437_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/438_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/438_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/439_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/439_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/440_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/440_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/441_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/441_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/442_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/442_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/443_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/443_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/444_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/444_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/445_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/445_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/446_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/446_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/447_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/447_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/448_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/448_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/449_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/449_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/450_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/450_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/451_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/451_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/452_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/452_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/453_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/453_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/454_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/454_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/455_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/455_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/456_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/456_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/457_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/457_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/458_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/458_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/459_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/459_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/460_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/460_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/461_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/461_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/462_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/462_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/463_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/463_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/464_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/464_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/465_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/465_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/466_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/466_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/467_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/467_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/468_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/468_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/469_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/469_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/470_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/470_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/471_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/471_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/472_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/472_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/473_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/473_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/474_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/474_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/475_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/475_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/476_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/476_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/477_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/477_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/478_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/478_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/479_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/479_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/480_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/480_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/481_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/481_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/482_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/482_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/483_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/483_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/484_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/484_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/485_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/485_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/486_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/486_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/487_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/487_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/488_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/488_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/489_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/489_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/490_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/490_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/491_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/491_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/492_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/492_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/493_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/493_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/494_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/494_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/495_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/495_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/496_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/496_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/497_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/497_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/498_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/498_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/499_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/499_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/500_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/500_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/501_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/501_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/502_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/502_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/503_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/503_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/504_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/504_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/505_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/505_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/506_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/506_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/507_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/507_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/508_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/508_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/509_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/509_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/510_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/510_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/511_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/511_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/512_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/512_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/513_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/513_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/514_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/514_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/515_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/515_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/516_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/516_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/517_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/517_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/518_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/518_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/519_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/519_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/520_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/520_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/521_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/521_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/522_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/522_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/523_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/523_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/524_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/524_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/525_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/525_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/526_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail/526_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/00_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/00_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/00_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/01_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/01_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/01_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/02_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/02_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/02_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/03_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/03_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/03_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/04_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/04_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/04_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/05_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/05_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/05_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/06_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/06_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/06_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/07_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/07_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/08_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/08_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/08_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/09_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/09_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/09_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/10_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/10_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/10_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/11_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/11_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/11_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/12_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/12_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/12_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/13_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/13_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/14_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/14_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/14_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/15_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/15_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/16_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/16_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/16_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/17_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/17_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/18_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/18_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/18_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/19_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/19_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/19_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/20_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/20_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/20_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/21_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/21_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/22_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/22_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/22_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/23_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/23_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/24_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/24_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/24_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/25_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/25_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/25_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/26_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/26_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/27_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/27_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/27_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/28_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/28_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/29_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/29_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/29_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/30_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/30_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/30_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/31_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/31_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/31_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/32_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/32_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/32_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/33_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/33_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/33_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/34_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/34_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/34_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/35_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/35_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/35_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/36_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/36_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/36_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/37_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/37_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/37_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/38_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/38_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/39_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/39_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/39_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/40_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/40_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/40_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/41_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/41_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/s2/41_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/000_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/001_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/001_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/002_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/002_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/003_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/003_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/004_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/004_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/004_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/005_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/005_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/006_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/007_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/008_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/008_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/009_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/009_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/010_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/010_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/010_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/011_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/011_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/012_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/012_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/013_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/013_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/014_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/014_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/015_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/015_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/015_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/016_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/016_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/016_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/017_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/017_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/018_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/018_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/019_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/019_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/019_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/020_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/020_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/021_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/021_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/021_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/022_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/022_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/023_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/023_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/024_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/024_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/025_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/025_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/026_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/026_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/026_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/027_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/027_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/028_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/028_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/029_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/029_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/030_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/030_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/030_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/031_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/031_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/032_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/032_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/033_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/033_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/033_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/034_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/034_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/035_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/035_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/036_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/036_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/036_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/037_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/037_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/038_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/038_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/038_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/039_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/040_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/040_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/041_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/041_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/042_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/043_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/043_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/043_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/044_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/044_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/044_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/045_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/045_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/046_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/046_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/047_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/047_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/048_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/048_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/049_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/049_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/050_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/050_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/051_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/052_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/052_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/053_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/053_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/054_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/054_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/054_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/055_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/055_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/056_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/056_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/057_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/057_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/058_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/059_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/059_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/060_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/060_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/061_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/061_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/062_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/062_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/062_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/063_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/063_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/064_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/065_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/065_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/065_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/066_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/066_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/067_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/067_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/067_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/068_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/068_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/069_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/070_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/070_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/071_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/071_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/071_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/072_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/073_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/073_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/073_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/074_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/074_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/075_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/075_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/076_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/077_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/078_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/078_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/079_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/079_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/080_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/081_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/081_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/082_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/082_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/083_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/084_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/084_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/085_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/085_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/085_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/086_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/086_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/087_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/087_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/088_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/089_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/089_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/089_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/090_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/090_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/091_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/092_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/093_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/094_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/094_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/095_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/095_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/096_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/097_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/097_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/098_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/098_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/099_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/099_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/100_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/101_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/101_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/102_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/102_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/102_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/103_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/103_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/104_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/104_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/104_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/105_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/105_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/106_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/106_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/107_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/107_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/108_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/108_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/109_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/109_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/109_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/110_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/110_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/111_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/111_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/112_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/112_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/112_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/113_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/114_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/115_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/115_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/116_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/117_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/117_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/118_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/118_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/118_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/119_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/119_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/120_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/120_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/120_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/121_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/121_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/122_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/123_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/124_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/124_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/125_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/126_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/126_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/127_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/127_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/128_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/128_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/129_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/129_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/130_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/130_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/130_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/131_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/131_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/131_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/132_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/132_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/133_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/133_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/134_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/134_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/135_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/136_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/137_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/137_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/137_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/138_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/138_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/139_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/139_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/140_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/140_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/141_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/141_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/142_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/142_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/143_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/143_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/143_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/144_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/144_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/145_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/145_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/146_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/146_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/147_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/147_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/148_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/148_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/149_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/149_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/150_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/150_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/151_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/151_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/152_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/153_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/154_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/154_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/155_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/155_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/156_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/156_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/157_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/157_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/157_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/158_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/158_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/159_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/159_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/159_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/160_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/160_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/161_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/161_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/162_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/162_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/163_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/163_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/164_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/164_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/164_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/165_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/165_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/166_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/166_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/167_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/168_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/169_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/169_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/170_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/171_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/171_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/171_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/172_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/172_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/173_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/174_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/175_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/175_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/176_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/176_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/177_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/178_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/178_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/178_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/179_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/179_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/180_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/180_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/181_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/181_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/182_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/182_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/183_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/184_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/184_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/185_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/185_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/186_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/186_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/186_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/187_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/188_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/188_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/189_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/189_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/190_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/190_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/190_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/191_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/192_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/192_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/193_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/194_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/194_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/195_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/195_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/196_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/196_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/197_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/197_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/198_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/198_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/198_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/199_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/199_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/200_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/200_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/200_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/201_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/201_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/202_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/202_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/203_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/203_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/204_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/204_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/205_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/205_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/205_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/206_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/206_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/207_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/208_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/208_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/208_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/209_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/209_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/210_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/210_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/211_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/211_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/212_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/212_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/213_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/213_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/214_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/214_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/214_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/215_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/215_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/216_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/216_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/216_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/217_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/217_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/218_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/219_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/219_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/220_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/221_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/221_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/222_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/222_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/223_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/223_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/224_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/225_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/226_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/226_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/227_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/227_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/228_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/228_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/229_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/229_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/230_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/230_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/230_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/231_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/231_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/232_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/233_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/233_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/234_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/235_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/235_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/235_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/236_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/236_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/237_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/237_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/238_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/239_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/239_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/240_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/241_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/241_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/242_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/242_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/243_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/243_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/244_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/244_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/245_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/245_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/246_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/246_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/247_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/247_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/247_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/248_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/248_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/249_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/249_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/250_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/250_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/250_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/251_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/251_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/251_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/252_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/252_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/253_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/253_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/254_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/255_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/255_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/255_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/256_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/256_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/257_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/257_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/257_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/258_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/258_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/259_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/259_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/260_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/260_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/261_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/261_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/262_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/262_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/262_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/263_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/263_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/263_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/264_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/265_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/265_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/266_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/266_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/267_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/267_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack/267_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/0_00004.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/1_00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/1_00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/wiki/1_00002.npy diff --git a/configs/v1-mix-small.yaml b/configs/v1-mix-small.yaml deleted file mode 100644 index 1a124ce4b..000000000 --- a/configs/v1-mix-small.yaml +++ /dev/null @@ -1,187 +0,0 @@ -run_name: v1-mix-small-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-small - group: v1-mix - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - include_bias: false - block_type: sequential - layer_norm_type: low_precision - layer_norm_with_affine: true # workaround for the layer norm bug - bias_for_layer_norm: true # workaround for the layer norm bug - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: normal - -compile: null # causes instability on AMD GPUs - -optimizer: - name: adamw - learning_rate: 1.0e-3 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 5000 - alpha_f: 0.1 - -data: - paths: ${path.glob:${oc.env:DATA_PATH}/v1-sample/gpt-neox-20b-pii-special/books/*.npy,${oc.env:DATA_PATH}/v1-sample/gpt-neox-20b-pii-special/c4/*.npy,${oc.env:DATA_PATH}/v1-sample/gpt-neox-20b-pii-special/common-crawl/*/*.npy,${oc.env:DATA_PATH}/v1-sample/gpt-neox-20b-pii-special/s2/*.npy,${oc.env:DATA_PATH}/v1-sample/gpt-neox-20b-pii-special/stack/*.npy,${oc.env:DATA_PATH}/v1-sample/gpt-neox-20b-pii-special/wiki/*.npy} - pad_direction: right - num_workers: 1 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 5000 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 476837 # 2T tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 8 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - # TODO: do we care about c4 and RP validation? We don't have these tokenized at the moment. - # - label: c4-validation - # subset_num_batches: 10 - # data: - # paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - # num_workers: 2 - # drop_last: true - # pin_memory: true - # persistent_workers: true - # prefetch_factor: 4 - - # - label: rp-validation - # subset_num_batches: 10 - # data: - # paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - # num_workers: 2 - # drop_last: true - # pin_memory: true - # persistent_workers: true - # prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/v1_5-mix-medium-mitch-ish-s3.yaml b/configs/v1_5-mix-medium-mitch-ish-s3.yaml deleted file mode 100644 index d39a6bd41..000000000 --- a/configs/v1_5-mix-medium-mitch-ish-s3.yaml +++ /dev/null @@ -1,625 +0,0 @@ -run_name: v1_5-mix-medium-mitch-ish -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: v1_5-mix - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: false - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: - fullgraph: false - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 5000 - alpha_f: 0.1 - grad_clip_warmup_steps: 1000 - grad_clip_warmup_factor: 10.0 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/7b/${run_name} -save_overwrite: true -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null # getting errors on LUMI right now -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2e12T # 2T tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 -time_limit: null - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-000-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-001-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-002-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-003-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-004-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-005-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-006-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-006-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-007-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-008-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-008-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-009-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-010-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-010-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-011-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-012-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-013-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-013-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-014-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-014-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-015-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-016-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-017-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-018-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-019-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-020-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-021-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-022-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-023-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-024-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-025-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-025-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-026-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-027-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-027-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-028-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-028-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-029-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-030-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-031-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-032-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-033-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-033-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-034-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-034-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-035-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-036-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-037-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-038-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-039-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-040-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-041-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-042-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-042-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-043-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-043-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-044-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-044-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-045-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-046-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-046-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-046-00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-047-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-048-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-049-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-050-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-051-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-052-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-052-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-053-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-053-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-054-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-055-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-055-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-056-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-056-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-057-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-057-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-058-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-059-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-060-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-061-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-062-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-062-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-063-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-063-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-064-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-064-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-064-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-065-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-065-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-065-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-066-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-067-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-067-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-068-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-068-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-069-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-070-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-071-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-072-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-073-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-074-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-075-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-076-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-077-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-078-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-078-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-079-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-079-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-080-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-081-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-082-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-083-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-083-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-084-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-085-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-086-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-086-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-087-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-088-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-089-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-089-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-089-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-090-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-091-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-091-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-092-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-092-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-093-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-093-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-093-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-094-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-094-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-094-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-095-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-095-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-096-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-096-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-097-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-097-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-097-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-098-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-098-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-099-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-099-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-100-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-100-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-100-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-101-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-101-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-102-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-102-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-103-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-103-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-104-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-104-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-105-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-105-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-106-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-106-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-106-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-107-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-107-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-108-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-108-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-109-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-109-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-109-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-110-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-110-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-110-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-111-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-111-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-112-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-112-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-113-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-113-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-114-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-114-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-114-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-115-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-115-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-116-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-116-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-117-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-117-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-118-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-118-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-119-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-119-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-120-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-120-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-120-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-121-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-121-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-122-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-122-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-122-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-123-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-123-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-123-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-124-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-124-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-125-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-125-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-126-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-126-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-127-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-127-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-127-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-128-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-128-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-129-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-129-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-129-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-130-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-130-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-131-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-131-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-132-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-132-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-133-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-133-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-133-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-134-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-134-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-134-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-135-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-135-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-135-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-136-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-136-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-137-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-137-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-137-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-138-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-138-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-139-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-139-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-140-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-140-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-141-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-141-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-141-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-142-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-142-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-142-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-143-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-143-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-144-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-144-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-144-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-145-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-145-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-145-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-146-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-146-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-146-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-147-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-147-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-147-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-148-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-148-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-149-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-149-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-149-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-150-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-150-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-150-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-150-00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-151-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-151-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-152-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-152-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-153-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-153-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-154-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-154-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-155-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-155-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-155-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-156-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-156-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-157-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-157-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-157-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-158-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-158-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-159-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-159-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-160-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-160-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-161-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-161-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-161-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-162-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-162-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-163-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-163-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-164-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-164-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-165-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-165-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-165-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-166-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-166-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-166-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-167-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-167-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-167-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-168-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-168-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-169-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-169-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-170-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-170-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-171-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-171-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-172-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-172-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-173-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-173-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-173-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-174-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-174-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-174-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-175-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-175-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-175-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-176-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-176-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-176-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-177-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-177-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-178-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-178-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-179-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-179-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-180-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-180-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-181-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-181-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-182-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-182-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-182-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-183-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-183-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-183-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-184-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-184-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-185-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-185-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-185-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-186-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-186-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-186-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-187-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-187-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-187-00002.npy diff --git a/configs/v1_5-mix-medium-mitch-ish.yaml b/configs/v1_5-mix-medium-mitch-ish.yaml deleted file mode 100644 index a94c263ae..000000000 --- a/configs/v1_5-mix-medium-mitch-ish.yaml +++ /dev/null @@ -1,179 +0,0 @@ -run_name: v1_5-mix-medium-mitch-ish -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: v1_5-mix - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: false - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 5000 - alpha_f: 0.1 - grad_clip_warmup_steps: 1000 - grad_clip_warmup_factor: 10.0 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null # getting errors on LUMI right now -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2e12T # 2T tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: null - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - ${oc.env:EVAL_DATA_PATH}/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream - -data: - paths: ${path.glob:${oc.env:DATA_PATH}/v1_5-sample/gpt-neox-20b-pii-special/*.npy} - pad_direction: right - num_workers: 0 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 diff --git a/configs/v1_5-mix-medium-s3.yaml b/configs/v1_5-mix-medium-s3.yaml deleted file mode 100644 index 9b4e537a0..000000000 --- a/configs/v1_5-mix-medium-s3.yaml +++ /dev/null @@ -1,645 +0,0 @@ -run_name: v1_5-mix-medium-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: v1_5-mix - -model: - d_model: 4096 - n_heads: 16 - n_layers: 29 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - flash_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: normal - -compile: - fullgraph: false - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - t_warmup: 5000 - alpha_f: 0.1 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/7b/${run_name} -save_overwrite: true -time_limit: null -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 476837 # 2T tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block - precision: pure - -max_grad_norm: 1.0 -max_grad_norm_ratio: 1.5 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - # TODO: do we care about c4 and RP validation? We don't have these tokenized at the moment. - # - label: c4-validation - # subset_num_batches: 10 - # data: - # paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - # num_workers: 2 - # drop_last: true - # pin_memory: true - # persistent_workers: true - # prefetch_factor: 4 - - # - label: rp-validation - # subset_num_batches: 10 - # data: - # paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - # num_workers: 2 - # drop_last: true - # pin_memory: true - # persistent_workers: true - # prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # pin_memory: true - # prefetch_factor: 1 - # persistent_workers: false - # timeout: 0 - datasets: - 4chan-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy - c4_100_domains-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy - gab-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy - manosphere-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy - mc4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy - ptb-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy - twitterAEE-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 1 - persistent_workers: true - timeout: 0 - paths: - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-000-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-001-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-002-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-003-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-004-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-005-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-006-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-006-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-007-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-008-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-008-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-009-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-010-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-010-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-011-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-012-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-013-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-013-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-014-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-014-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-015-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-016-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-017-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-018-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-019-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-020-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-021-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-022-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-023-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-024-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-025-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-025-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-026-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-027-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-027-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-028-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-028-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-029-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-030-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-031-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-032-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-033-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-033-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-034-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-034-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-035-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-036-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-037-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-038-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-039-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-040-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-041-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-042-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-042-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-043-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-043-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-044-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-044-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-045-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-046-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-046-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-046-00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-047-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-048-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-049-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-050-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-051-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-052-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-052-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-053-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-053-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-054-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-055-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-055-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-056-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-056-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-057-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-057-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-058-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-059-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-060-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-061-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-062-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-062-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-063-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-063-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-064-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-064-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-064-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-065-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-065-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-065-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-066-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-067-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-067-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-068-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-068-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-069-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-070-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-071-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-072-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-073-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-074-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-075-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-076-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-077-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-078-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-078-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-079-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-079-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-080-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-081-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-082-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-083-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-083-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-084-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-085-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-086-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-086-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-087-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-088-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-089-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-089-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-089-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-090-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-091-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-091-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-092-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-092-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-093-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-093-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-093-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-094-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-094-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-094-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-095-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-095-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-096-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-096-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-097-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-097-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-097-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-098-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-098-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-099-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-099-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-100-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-100-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-100-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-101-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-101-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-102-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-102-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-103-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-103-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-104-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-104-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-105-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-105-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-106-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-106-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-106-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-107-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-107-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-108-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-108-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-109-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-109-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-109-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-110-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-110-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-110-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-111-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-111-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-112-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-112-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-113-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-113-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-114-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-114-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-114-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-115-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-115-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-116-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-116-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-117-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-117-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-118-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-118-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-119-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-119-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-120-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-120-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-120-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-121-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-121-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-122-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-122-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-122-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-123-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-123-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-123-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-124-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-124-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-125-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-125-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-126-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-126-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-127-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-127-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-127-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-128-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-128-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-129-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-129-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-129-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-130-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-130-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-131-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-131-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-132-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-132-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-133-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-133-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-133-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-134-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-134-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-134-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-135-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-135-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-135-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-136-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-136-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-137-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-137-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-137-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-138-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-138-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-139-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-139-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-140-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-140-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-141-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-141-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-141-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-142-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-142-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-142-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-143-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-143-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-144-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-144-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-144-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-145-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-145-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-145-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-146-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-146-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-146-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-147-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-147-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-147-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-148-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-148-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-149-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-149-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-149-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-150-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-150-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-150-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-150-00003.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-151-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-151-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-152-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-152-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-153-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-153-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-154-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-154-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-155-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-155-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-155-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-156-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-156-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-157-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-157-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-157-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-158-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-158-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-159-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-159-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-160-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-160-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-161-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-161-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-161-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-162-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-162-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-163-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-163-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-164-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-164-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-165-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-165-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-165-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-166-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-166-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-166-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-167-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-167-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-167-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-168-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-168-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-169-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-169-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-170-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-170-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-171-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-171-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-172-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-172-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-173-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-173-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-173-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-174-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-174-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-174-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-175-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-175-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-175-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-176-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-176-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-176-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-177-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-177-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-178-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-178-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-179-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-179-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-180-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-180-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-181-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-181-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-182-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-182-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-182-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-183-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-183-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-183-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-184-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-184-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-185-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-185-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-185-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-186-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-186-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-186-00002.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-187-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-187-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special/part-187-00002.npy