From 89c8a0c80154b71ea600ab4916097b9e1a7fa423 Mon Sep 17 00:00:00 2001 From: markus583 Date: Thu, 28 Dec 2023 09:18:23 +0000 Subject: [PATCH] clean up cache right before training --- configs/xlmr_stratify_0.1_3layers_bs128.json | 2 +- configs/xlmr_stratify_0.1_3layers_bs256.json | 2 +- configs/xlmr_stratify_0.1_3layers_bs64.json | 2 +- configs/xlmr_stratify_0.1_3layers_highlr.json | 43 +++++++++++++++++++ configs/xlmr_stratify_0.1_3layers_no_aux.json | 2 +- configs/xlmr_stratify_0.1_3layers_nounks.json | 2 +- .../xlmr_stratify_0.1_3layers_shorter.json | 2 +- configs/xlmr_stratify_0.1_6layers.json | 43 +++++++++++++++++++ wtpsplit/train/train.py | 5 +++ wtpsplit/utils.py | 8 ++-- 10 files changed, 101 insertions(+), 10 deletions(-) create mode 100644 configs/xlmr_stratify_0.1_3layers_highlr.json create mode 100644 configs/xlmr_stratify_0.1_6layers.json diff --git a/configs/xlmr_stratify_0.1_3layers_bs128.json b/configs/xlmr_stratify_0.1_3layers_bs128.json index 2d3b8d56..3d608e22 100644 --- a/configs/xlmr_stratify_0.1_3layers_bs128.json +++ b/configs/xlmr_stratify_0.1_3layers_bs128.json @@ -1,6 +1,6 @@ { "model_name_or_path": "xlm-roberta-base", - "output_dir": "xlmr-normal", + "output_dir": "xlmr-normal-bs-128", "train_text_path": "data/sentence/train.parquet", "valid_text_path": "data/sentence/valid.parquet", "block_size": 128, diff --git a/configs/xlmr_stratify_0.1_3layers_bs256.json b/configs/xlmr_stratify_0.1_3layers_bs256.json index 807ed62c..2ad92729 100644 --- a/configs/xlmr_stratify_0.1_3layers_bs256.json +++ b/configs/xlmr_stratify_0.1_3layers_bs256.json @@ -1,6 +1,6 @@ { "model_name_or_path": "xlm-roberta-base", - "output_dir": "xlmr-normal", + "output_dir": "xlmr-normal-bs256", "train_text_path": "data/sentence/train.parquet", "valid_text_path": "data/sentence/valid.parquet", "block_size": 256, diff --git a/configs/xlmr_stratify_0.1_3layers_bs64.json b/configs/xlmr_stratify_0.1_3layers_bs64.json index 169d1325..40e4713f 100644 --- a/configs/xlmr_stratify_0.1_3layers_bs64.json +++ b/configs/xlmr_stratify_0.1_3layers_bs64.json @@ -1,6 +1,6 @@ { "model_name_or_path": "xlm-roberta-base", - "output_dir": "xlmr-normal", + "output_dir": "xlmr-normal-bs64", "train_text_path": "data/sentence/train.parquet", "valid_text_path": "data/sentence/valid.parquet", "block_size": 64, diff --git a/configs/xlmr_stratify_0.1_3layers_highlr.json b/configs/xlmr_stratify_0.1_3layers_highlr.json new file mode 100644 index 00000000..bfa6fb7b --- /dev/null +++ b/configs/xlmr_stratify_0.1_3layers_highlr.json @@ -0,0 +1,43 @@ +{ + "model_name_or_path": "xlm-roberta-base", + "output_dir": "xlmr-normal-highlr", + "train_text_path": "data/sentence/train.parquet", + "valid_text_path": "data/sentence/valid.parquet", + "block_size": 512, + "use_bert": true, + "do_train": true, + "do_eval": true, + "evaluation_strategy": "steps", + "per_device_train_batch_size": 32, + "per_device_eval_batch_size": 32, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": 8, + "dataloader_num_workers": 4, + "preprocessing_num_workers": 32, + "learning_rate": 3e-4, + "save_strategy": "steps", + "fp16": false, + "max_steps": 2000000, + "save_steps": 100000, + "eval_steps": 5000, + "logging_steps": 50, + "report_to": "wandb", + "is_decoder": false, + "remove_unused_columns": false, + "lookahead": null, + "one_sample_per_line": false, + "do_sentence_training": true, + "do_auxiliary_training": true, + "warmup_steps": 5000, + "adapter_warmup_steps": 0, + "adapter_lr_multiplier": 1, + "ngram_order": 1, + 
"non_punctuation_sample_ratio": 0.1, + "prediction_loss_only": true, + "use_auxiliary": true, + "ddp_timeout": 3600, + "use_subwords": true, + "num_hidden_layers": 3, + "custom_punctuation_file": "punctuation_xlmr_unk.txt", + "log_level": "info" +} \ No newline at end of file diff --git a/configs/xlmr_stratify_0.1_3layers_no_aux.json b/configs/xlmr_stratify_0.1_3layers_no_aux.json index 750e93d1..1a863f2c 100644 --- a/configs/xlmr_stratify_0.1_3layers_no_aux.json +++ b/configs/xlmr_stratify_0.1_3layers_no_aux.json @@ -1,6 +1,6 @@ { "model_name_or_path": "xlm-roberta-base", - "output_dir": "xlmr-normal", + "output_dir": "xlmr-normal-noaux", "train_text_path": "data/sentence/train.parquet", "valid_text_path": "data/sentence/valid.parquet", "block_size": 512, diff --git a/configs/xlmr_stratify_0.1_3layers_nounks.json b/configs/xlmr_stratify_0.1_3layers_nounks.json index f61c50b7..28fd936a 100644 --- a/configs/xlmr_stratify_0.1_3layers_nounks.json +++ b/configs/xlmr_stratify_0.1_3layers_nounks.json @@ -1,6 +1,6 @@ { "model_name_or_path": "xlm-roberta-base", - "output_dir": "xlmr-normal-no_unks", + "output_dir": "xlmr-normal-nounks", "train_text_path": "data/sentence/train.parquet", "valid_text_path": "data/sentence/valid.parquet", "block_size": 512, diff --git a/configs/xlmr_stratify_0.1_3layers_shorter.json b/configs/xlmr_stratify_0.1_3layers_shorter.json index 056dee98..dad2296b 100644 --- a/configs/xlmr_stratify_0.1_3layers_shorter.json +++ b/configs/xlmr_stratify_0.1_3layers_shorter.json @@ -1,6 +1,6 @@ { "model_name_or_path": "xlm-roberta-base", - "output_dir": "xlmr-shorter", + "output_dir": "xlmr-normal-shorter", "train_text_path": "data/sentence/train.parquet", "valid_text_path": "data/sentence/valid.parquet", "block_size": 512, diff --git a/configs/xlmr_stratify_0.1_6layers.json b/configs/xlmr_stratify_0.1_6layers.json new file mode 100644 index 00000000..e8231a96 --- /dev/null +++ b/configs/xlmr_stratify_0.1_6layers.json @@ -0,0 +1,43 @@ +{ + "model_name_or_path": "xlm-roberta-base", + "output_dir": "xlmr-normal-6", + "train_text_path": "data/sentence/train.parquet", + "valid_text_path": "data/sentence/valid.parquet", + "block_size": 512, + "use_bert": true, + "do_train": true, + "do_eval": true, + "evaluation_strategy": "steps", + "per_device_train_batch_size": 32, + "per_device_eval_batch_size": 32, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": 8, + "dataloader_num_workers": 4, + "preprocessing_num_workers": 32, + "learning_rate": 1e-4, + "save_strategy": "steps", + "fp16": false, + "max_steps": 2000000, + "save_steps": 100000, + "eval_steps": 5000, + "logging_steps": 50, + "report_to": "wandb", + "is_decoder": false, + "remove_unused_columns": false, + "lookahead": null, + "one_sample_per_line": false, + "do_sentence_training": true, + "do_auxiliary_training": true, + "warmup_steps": 5000, + "adapter_warmup_steps": 0, + "adapter_lr_multiplier": 1, + "ngram_order": 1, + "non_punctuation_sample_ratio": 0.1, + "prediction_loss_only": true, + "use_auxiliary": true, + "ddp_timeout": 3600, + "use_subwords": true, + "num_hidden_layers": 6, + "custom_punctuation_file": "punctuation_xlmr_unk.txt", + "log_level": "info" +} \ No newline at end of file diff --git a/wtpsplit/train/train.py b/wtpsplit/train/train.py index 4e453e3d..2be0715f 100644 --- a/wtpsplit/train/train.py +++ b/wtpsplit/train/train.py @@ -666,6 +666,11 @@ def maybe_pad(text): split="train", ) logger.info(f"Train dataset has {len(train_dataset)} examples.") + + with training_args.main_process_first(): + 
train_dataset.cleanup_cache_files() + valid_dataset.cleanup_cache_files() + logger.warning("Cleaned up cache files.") # print some samples from the dataset count = 0 diff --git a/wtpsplit/utils.py b/wtpsplit/utils.py index afd75b5b..f0986218 100644 --- a/wtpsplit/utils.py +++ b/wtpsplit/utils.py @@ -102,11 +102,11 @@ def get_subword_label_dict(label_args, tokenizer): n_unks = 0 # Map auxiliary characters to token IDs with labels - logger.warning(f"Using {Constants.PUNCTUATION_CHARS} auxiliary characters.") + logger.info(f"Using {Constants.PUNCTUATION_CHARS} auxiliary characters.") for i, c in enumerate(Constants.PUNCTUATION_CHARS): token_id = tokenizer.convert_tokens_to_ids(c) label_dict[token_id] = 1 + Constants.AUX_OFFSET + i - logger.warning( + logger.info( f"auxiliary character {c} has token ID {token_id} and label {label_dict[token_id]}, decoded: {tokenizer.decode([token_id])}" ) if token_id == tokenizer.unk_token_id: @@ -118,8 +118,8 @@ def get_subword_label_dict(label_args, tokenizer): for c in label_args.newline_chars: token_id = tokenizer.convert_tokens_to_ids(c) label_dict[token_id] = 1 + Constants.NEWLINE_INDEX - logger.warning(f"newline character {c} has token ID {token_id} and label {label_dict[token_id]}, decoded:") - logger.warning(f"{tokenizer.decode([token_id])}") + logger.info(f"newline character {c} has token ID {token_id} and label {label_dict[token_id]}, decoded:") + logger.info(f"{tokenizer.decode([token_id])}") return label_dict
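
For context, the functional change above clears stale Hugging Face `datasets` caches on the main process only, right before training begins. Below is a minimal, self-contained sketch of that pattern, assuming only the stock `datasets`/`transformers` APIs; the parquet paths come from the configs above, while the variable names, logger setup, and `output_dir` value are illustrative stand-ins rather than code taken verbatim from `wtpsplit/train/train.py`:

```python
import logging

from datasets import load_dataset
from transformers import TrainingArguments

logger = logging.getLogger(__name__)

# Paths match the "train_text_path"/"valid_text_path" values in the configs above.
train_dataset = load_dataset("parquet", data_files="data/sentence/train.parquet", split="train")
valid_dataset = load_dataset("parquet", data_files="data/sentence/valid.parquet", split="train")

training_args = TrainingArguments(output_dir="xlmr-normal")  # illustrative value

# main_process_first() is a context manager: under distributed (DDP) training,
# the main process runs the body first while the other ranks wait, so the
# cleanup happens exactly once instead of racing across processes.
with training_args.main_process_first():
    # cleanup_cache_files() deletes the Arrow cache files in each dataset's
    # cache directory (leftovers from earlier .map()/.filter() calls), except
    # the file currently backing the dataset, and returns how many it removed.
    n_train = train_dataset.cleanup_cache_files()
    n_valid = valid_dataset.cleanup_cache_files()
    logger.warning("Cleaned up %d train / %d valid cache files.", n_train, n_valid)
```

The guard matters because without `main_process_first()`, every rank in a multi-GPU run could try to delete the same cache files concurrently; and since `cleanup_cache_files()` spares the cache file currently in use, the live datasets remain intact while stale caches from prior runs are dropped before training repopulates them.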