From 89c8a0c80154b71ea600ab4916097b9e1a7fa423 Mon Sep 17 00:00:00 2001 From: markus583 Date: Thu, 28 Dec 2023 09:18:23 +0000 Subject: [PATCH] clean up cache right before training --- configs/xlmr_stratify_0.1_3layers_bs128.json | 2 +- configs/xlmr_stratify_0.1_3layers_bs256.json | 2 +- configs/xlmr_stratify_0.1_3layers_bs64.json | 2 +- configs/xlmr_stratify_0.1_3layers_highlr.json | 43 +++++++++++++++++++ configs/xlmr_stratify_0.1_3layers_no_aux.json | 2 +- configs/xlmr_stratify_0.1_3layers_nounks.json | 2 +- .../xlmr_stratify_0.1_3layers_shorter.json | 2 +- configs/xlmr_stratify_0.1_6layers.json | 43 +++++++++++++++++++ wtpsplit/train/train.py | 5 +++ wtpsplit/utils.py | 8 ++-- 10 files changed, 101 insertions(+), 10 deletions(-) create mode 100644 configs/xlmr_stratify_0.1_3layers_highlr.json create mode 100644 configs/xlmr_stratify_0.1_6layers.json diff --git a/configs/xlmr_stratify_0.1_3layers_bs128.json b/configs/xlmr_stratify_0.1_3layers_bs128.json index 2d3b8d56..3d608e22 100644 --- a/configs/xlmr_stratify_0.1_3layers_bs128.json +++ b/configs/xlmr_stratify_0.1_3layers_bs128.json @@ -1,6 +1,6 @@ { "model_name_or_path": "xlm-roberta-base", - "output_dir": "xlmr-normal", + "output_dir": "xlmr-normal-bs-128", "train_text_path": "data/sentence/train.parquet", "valid_text_path": "data/sentence/valid.parquet", "block_size": 128, diff --git a/configs/xlmr_stratify_0.1_3layers_bs256.json b/configs/xlmr_stratify_0.1_3layers_bs256.json index 807ed62c..2ad92729 100644 --- a/configs/xlmr_stratify_0.1_3layers_bs256.json +++ b/configs/xlmr_stratify_0.1_3layers_bs256.json @@ -1,6 +1,6 @@ { "model_name_or_path": "xlm-roberta-base", - "output_dir": "xlmr-normal", + "output_dir": "xlmr-normal-bs256", "train_text_path": "data/sentence/train.parquet", "valid_text_path": "data/sentence/valid.parquet", "block_size": 256, diff --git a/configs/xlmr_stratify_0.1_3layers_bs64.json b/configs/xlmr_stratify_0.1_3layers_bs64.json index 169d1325..40e4713f 100644 --- a/configs/xlmr_stratify_0.1_3layers_bs64.json +++ b/configs/xlmr_stratify_0.1_3layers_bs64.json @@ -1,6 +1,6 @@ { "model_name_or_path": "xlm-roberta-base", - "output_dir": "xlmr-normal", + "output_dir": "xlmr-normal-bs64", "train_text_path": "data/sentence/train.parquet", "valid_text_path": "data/sentence/valid.parquet", "block_size": 64, diff --git a/configs/xlmr_stratify_0.1_3layers_highlr.json b/configs/xlmr_stratify_0.1_3layers_highlr.json new file mode 100644 index 00000000..bfa6fb7b --- /dev/null +++ b/configs/xlmr_stratify_0.1_3layers_highlr.json @@ -0,0 +1,43 @@ +{ + "model_name_or_path": "xlm-roberta-base", + "output_dir": "xlmr-normal-highlr", + "train_text_path": "data/sentence/train.parquet", + "valid_text_path": "data/sentence/valid.parquet", + "block_size": 512, + "use_bert": true, + "do_train": true, + "do_eval": true, + "evaluation_strategy": "steps", + "per_device_train_batch_size": 32, + "per_device_eval_batch_size": 32, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": 8, + "dataloader_num_workers": 4, + "preprocessing_num_workers": 32, + "learning_rate": 3e-4, + "save_strategy": "steps", + "fp16": false, + "max_steps": 2000000, + "save_steps": 100000, + "eval_steps": 5000, + "logging_steps": 50, + "report_to": "wandb", + "is_decoder": false, + "remove_unused_columns": false, + "lookahead": null, + "one_sample_per_line": false, + "do_sentence_training": true, + "do_auxiliary_training": true, + "warmup_steps": 5000, + "adapter_warmup_steps": 0, + "adapter_lr_multiplier": 1, + "ngram_order": 1, + 
"non_punctuation_sample_ratio": 0.1, + "prediction_loss_only": true, + "use_auxiliary": true, + "ddp_timeout": 3600, + "use_subwords": true, + "num_hidden_layers": 3, + "custom_punctuation_file": "punctuation_xlmr_unk.txt", + "log_level": "info" +} \ No newline at end of file diff --git a/configs/xlmr_stratify_0.1_3layers_no_aux.json b/configs/xlmr_stratify_0.1_3layers_no_aux.json index 750e93d1..1a863f2c 100644 --- a/configs/xlmr_stratify_0.1_3layers_no_aux.json +++ b/configs/xlmr_stratify_0.1_3layers_no_aux.json @@ -1,6 +1,6 @@ { "model_name_or_path": "xlm-roberta-base", - "output_dir": "xlmr-normal", + "output_dir": "xlmr-normal-noaux", "train_text_path": "data/sentence/train.parquet", "valid_text_path": "data/sentence/valid.parquet", "block_size": 512, diff --git a/configs/xlmr_stratify_0.1_3layers_nounks.json b/configs/xlmr_stratify_0.1_3layers_nounks.json index f61c50b7..28fd936a 100644 --- a/configs/xlmr_stratify_0.1_3layers_nounks.json +++ b/configs/xlmr_stratify_0.1_3layers_nounks.json @@ -1,6 +1,6 @@ { "model_name_or_path": "xlm-roberta-base", - "output_dir": "xlmr-normal-no_unks", + "output_dir": "xlmr-normal-nounks", "train_text_path": "data/sentence/train.parquet", "valid_text_path": "data/sentence/valid.parquet", "block_size": 512, diff --git a/configs/xlmr_stratify_0.1_3layers_shorter.json b/configs/xlmr_stratify_0.1_3layers_shorter.json index 056dee98..dad2296b 100644 --- a/configs/xlmr_stratify_0.1_3layers_shorter.json +++ b/configs/xlmr_stratify_0.1_3layers_shorter.json @@ -1,6 +1,6 @@ { "model_name_or_path": "xlm-roberta-base", - "output_dir": "xlmr-shorter", + "output_dir": "xlmr-normal-shorter", "train_text_path": "data/sentence/train.parquet", "valid_text_path": "data/sentence/valid.parquet", "block_size": 512, diff --git a/configs/xlmr_stratify_0.1_6layers.json b/configs/xlmr_stratify_0.1_6layers.json new file mode 100644 index 00000000..e8231a96 --- /dev/null +++ b/configs/xlmr_stratify_0.1_6layers.json @@ -0,0 +1,43 @@ +{ + "model_name_or_path": "xlm-roberta-base", + "output_dir": "xlmr-normal-6", + "train_text_path": "data/sentence/train.parquet", + "valid_text_path": "data/sentence/valid.parquet", + "block_size": 512, + "use_bert": true, + "do_train": true, + "do_eval": true, + "evaluation_strategy": "steps", + "per_device_train_batch_size": 32, + "per_device_eval_batch_size": 32, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": 8, + "dataloader_num_workers": 4, + "preprocessing_num_workers": 32, + "learning_rate": 1e-4, + "save_strategy": "steps", + "fp16": false, + "max_steps": 2000000, + "save_steps": 100000, + "eval_steps": 5000, + "logging_steps": 50, + "report_to": "wandb", + "is_decoder": false, + "remove_unused_columns": false, + "lookahead": null, + "one_sample_per_line": false, + "do_sentence_training": true, + "do_auxiliary_training": true, + "warmup_steps": 5000, + "adapter_warmup_steps": 0, + "adapter_lr_multiplier": 1, + "ngram_order": 1, + "non_punctuation_sample_ratio": 0.1, + "prediction_loss_only": true, + "use_auxiliary": true, + "ddp_timeout": 3600, + "use_subwords": true, + "num_hidden_layers": 6, + "custom_punctuation_file": "punctuation_xlmr_unk.txt", + "log_level": "info" +} \ No newline at end of file diff --git a/wtpsplit/train/train.py b/wtpsplit/train/train.py index 4e453e3d..2be0715f 100644 --- a/wtpsplit/train/train.py +++ b/wtpsplit/train/train.py @@ -666,6 +666,11 @@ def maybe_pad(text): split="train", ) logger.info(f"Train dataset has {len(train_dataset)} examples.") + + with training_args.main_process_first(): + 
train_dataset.cleanup_cache_files() + valid_dataset.cleanup_cache_files() + logger.warning("Cleaned up cache files.") # print some samples from the dataset count = 0 diff --git a/wtpsplit/utils.py b/wtpsplit/utils.py index afd75b5b..f0986218 100644 --- a/wtpsplit/utils.py +++ b/wtpsplit/utils.py @@ -102,11 +102,11 @@ def get_subword_label_dict(label_args, tokenizer): n_unks = 0 # Map auxiliary characters to token IDs with labels - logger.warning(f"Using {Constants.PUNCTUATION_CHARS} auxiliary characters.") + logger.info(f"Using {Constants.PUNCTUATION_CHARS} auxiliary characters.") for i, c in enumerate(Constants.PUNCTUATION_CHARS): token_id = tokenizer.convert_tokens_to_ids(c) label_dict[token_id] = 1 + Constants.AUX_OFFSET + i - logger.warning( + logger.info( f"auxiliary character {c} has token ID {token_id} and label {label_dict[token_id]}, decoded: {tokenizer.decode([token_id])}" ) if token_id == tokenizer.unk_token_id: @@ -118,8 +118,8 @@ def get_subword_label_dict(label_args, tokenizer): for c in label_args.newline_chars: token_id = tokenizer.convert_tokens_to_ids(c) label_dict[token_id] = 1 + Constants.NEWLINE_INDEX - logger.warning(f"newline character {c} has token ID {token_id} and label {label_dict[token_id]}, decoded:") - logger.warning(f"{tokenizer.decode([token_id])}") + logger.info(f"newline character {c} has token ID {token_id} and label {label_dict[token_id]}, decoded:") + logger.info(f"{tokenizer.decode([token_id])}") return label_dict
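
For context, the functional change above clears stale Hugging Face `datasets` caches on the main process only, right before training begins. Below is a minimal, self-contained sketch of that pattern, assuming only the stock `datasets`/`transformers` APIs; the parquet paths come from the configs above, while the variable names, logger setup, and `output_dir` value are illustrative stand-ins rather than code taken verbatim from `wtpsplit/train/train.py`:

```python
import logging

from datasets import load_dataset
from transformers import TrainingArguments

logger = logging.getLogger(__name__)

# Paths match the "train_text_path"/"valid_text_path" values in the configs above.
train_dataset = load_dataset("parquet", data_files="data/sentence/train.parquet", split="train")
valid_dataset = load_dataset("parquet", data_files="data/sentence/valid.parquet", split="train")

training_args = TrainingArguments(output_dir="xlmr-normal")  # illustrative value

# main_process_first() is a context manager: under distributed (DDP) training,
# the main process runs the body first while the other ranks wait, so the
# cleanup happens exactly once instead of racing across processes.
with training_args.main_process_first():
    # cleanup_cache_files() deletes the Arrow cache files in each dataset's
    # cache directory (leftovers from earlier .map()/.filter() calls), except
    # the file currently backing the dataset, and returns how many it removed.
    n_train = train_dataset.cleanup_cache_files()
    n_valid = valid_dataset.cleanup_cache_files()
    logger.warning("Cleaned up %d train / %d valid cache files.", n_train, n_valid)
```

The guard matters because without `main_process_first()`, every rank in a multi-GPU run could try to delete the same cache files concurrently; and since `cleanup_cache_files()` spares the cache file currently in use, the live datasets remain intact while stale caches from prior runs are dropped before training repopulates them.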